Unverified commit 3e62e60b, authored by liuzhe-lz and committed by GitHub
Browse files

Refactor web UI to support incremental metric loading (#1557)

* Refactor web UI to support incremental metric loading

* refactor

* Remove host job

* Move sequence ID to NNI manager

* implement incremental loading
parent 99f7d79c
...@@ -30,7 +30,6 @@ class ExperimentStartupInfo { ...@@ -30,7 +30,6 @@ class ExperimentStartupInfo {
private newExperiment: boolean = true; private newExperiment: boolean = true;
private basePort: number = -1; private basePort: number = -1;
private initialized: boolean = false; private initialized: boolean = false;
private initTrialSequenceID: number = 0;
private logDir: string = ''; private logDir: string = '';
private logLevel: string = ''; private logLevel: string = '';
private readonly: boolean = false; private readonly: boolean = false;
...@@ -93,17 +92,6 @@ class ExperimentStartupInfo { ...@@ -93,17 +92,6 @@ class ExperimentStartupInfo {
return this.readonly; return this.readonly;
} }
public setInitTrialSequenceId(initSequenceId: number): void {
assert(this.initialized);
this.initTrialSequenceID = initSequenceId;
}
public getInitTrialSequenceId(): number {
assert(this.initialized);
return this.initTrialSequenceID;
}
} }
function getExperimentId(): string { function getExperimentId(): string {
...@@ -118,14 +106,6 @@ function isNewExperiment(): boolean { ...@@ -118,14 +106,6 @@ function isNewExperiment(): boolean {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).isNewExperiment(); return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).isNewExperiment();
} }
function setInitTrialSequenceId(initSequenceId: number): void {
component.get<ExperimentStartupInfo>(ExperimentStartupInfo).setInitTrialSequenceId(initSequenceId);
}
function getInitTrialSequenceId(): number {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).getInitTrialSequenceId();
}
function getExperimentStartupInfo(): ExperimentStartupInfo { function getExperimentStartupInfo(): ExperimentStartupInfo {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo); return component.get<ExperimentStartupInfo>(ExperimentStartupInfo);
} }
...@@ -141,4 +121,4 @@ function isReadonly(): boolean { ...@@ -141,4 +121,4 @@ function isReadonly(): boolean {
} }
export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment, getExperimentStartupInfo, export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment, getExperimentStartupInfo,
setExperimentStartupInfo, setInitTrialSequenceId, getInitTrialSequenceId, isReadonly }; setExperimentStartupInfo, isReadonly };
...@@ -83,7 +83,7 @@ interface ExperimentProfile { ...@@ -83,7 +83,7 @@ interface ExperimentProfile {
logDir?: string; logDir?: string;
startTime?: number; startTime?: number;
endTime?: number; endTime?: number;
maxSequenceId: number; nextSequenceId: number;
revision: number; revision: number;
} }
...@@ -115,6 +115,9 @@ abstract class Manager { ...@@ -115,6 +115,9 @@ abstract class Manager {
public abstract getClusterMetadata(key: string): Promise<string>; public abstract getClusterMetadata(key: string): Promise<string>;
public abstract getMetricData(trialJobId?: string, metricType?: MetricType): Promise<MetricDataRecord[]>; public abstract getMetricData(trialJobId?: string, metricType?: MetricType): Promise<MetricDataRecord[]>;
public abstract getMetricDataByRange(minSeqId: number, maxSeqId: number): Promise<MetricDataRecord[]>;
public abstract getLatestMetricData(): Promise<MetricDataRecord[]>;
public abstract getTrialJobStatistics(): Promise<TrialJobStatistics[]>; public abstract getTrialJobStatistics(): Promise<TrialJobStatistics[]>;
public abstract getStatus(): NNIManagerStatus; public abstract getStatus(): NNIManagerStatus;
} }
......
...@@ -23,20 +23,12 @@ ...@@ -23,20 +23,12 @@
* define TrialJobStatus * define TrialJobStatus
*/ */
type TrialJobStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED' | 'SYS_CANCELED' | 'EARLY_STOPPED'; type TrialJobStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED' | 'SYS_CANCELED' | 'EARLY_STOPPED';
type JobType = 'TRIAL' | 'HOST';
interface TrainingServiceMetadata { interface TrainingServiceMetadata {
readonly key: string; readonly key: string;
readonly value: string; readonly value: string;
} }
/**
* define JobApplicationForm
*/
interface JobApplicationForm {
readonly jobType: JobType;
}
interface HyperParameters { interface HyperParameters {
readonly value: string; readonly value: string;
readonly index: number; readonly index: number;
...@@ -45,18 +37,11 @@ interface HyperParameters { ...@@ -45,18 +37,11 @@ interface HyperParameters {
/** /**
* define TrialJobApplicationForm * define TrialJobApplicationForm
*/ */
interface TrialJobApplicationForm extends JobApplicationForm { interface TrialJobApplicationForm {
readonly sequenceId: number;
readonly hyperParameters: HyperParameters; readonly hyperParameters: HyperParameters;
} }
/**
* define HostJobApplicationForm
*/
interface HostJobApplicationForm extends JobApplicationForm {
readonly host: string;
readonly cmd: string;
}
/** /**
* define TrialJobDetail * define TrialJobDetail
*/ */
...@@ -69,8 +54,7 @@ interface TrialJobDetail { ...@@ -69,8 +54,7 @@ interface TrialJobDetail {
readonly tags?: string[]; readonly tags?: string[];
readonly url?: string; readonly url?: string;
readonly workingDirectory: string; readonly workingDirectory: string;
readonly form: JobApplicationForm; readonly form: TrialJobApplicationForm;
readonly sequenceId: number;
isEarlyStopped?: boolean; isEarlyStopped?: boolean;
} }
...@@ -112,8 +96,8 @@ abstract class TrainingService { ...@@ -112,8 +96,8 @@ abstract class TrainingService {
public abstract getTrialJob(trialJobId: string): Promise<TrialJobDetail>; public abstract getTrialJob(trialJobId: string): Promise<TrialJobDetail>;
public abstract addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void; public abstract addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void;
public abstract removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void; public abstract removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void;
public abstract submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail>; public abstract submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail>;
public abstract updateTrialJob(trialJobId: string, form: JobApplicationForm): Promise<TrialJobDetail>; public abstract updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise<TrialJobDetail>;
public abstract get isMultiPhaseJobSupported(): boolean; public abstract get isMultiPhaseJobSupported(): boolean;
public abstract cancelTrialJob(trialJobId: string, isEarlyStopped?: boolean): Promise<void>; public abstract cancelTrialJob(trialJobId: string, isEarlyStopped?: boolean): Promise<void>;
public abstract setClusterMetadata(key: string, value: string): Promise<void>; public abstract setClusterMetadata(key: string, value: string): Promise<void>;
...@@ -135,5 +119,5 @@ class NNIManagerIpConfig { ...@@ -135,5 +119,5 @@ class NNIManagerIpConfig {
export { export {
TrainingService, TrainingServiceError, TrialJobStatus, TrialJobApplicationForm, TrainingService, TrainingServiceError, TrialJobStatus, TrialJobApplicationForm,
TrainingServiceMetadata, TrialJobDetail, TrialJobMetric, HyperParameters, TrainingServiceMetadata, TrialJobDetail, TrialJobMetric, HyperParameters,
HostJobApplicationForm, JobApplicationForm, JobType, NNIManagerIpConfig NNIManagerIpConfig
}; };
...@@ -26,7 +26,7 @@ import { Deferred } from 'ts-deferred'; ...@@ -26,7 +26,7 @@ import { Deferred } from 'ts-deferred';
import * as component from '../common/component'; import * as component from '../common/component';
import { DataStore, MetricDataRecord, MetricType, TrialJobInfo } from '../common/datastore'; import { DataStore, MetricDataRecord, MetricType, TrialJobInfo } from '../common/datastore';
import { NNIError } from '../common/errors'; import { NNIError } from '../common/errors';
import { getExperimentId, setInitTrialSequenceId } from '../common/experimentStartupInfo'; import { getExperimentId } from '../common/experimentStartupInfo';
import { getLogger, Logger } from '../common/log'; import { getLogger, Logger } from '../common/log';
import { import {
ExperimentParams, ExperimentProfile, Manager, ExperimentStatus, ExperimentParams, ExperimentProfile, Manager, ExperimentStatus,
...@@ -204,7 +204,6 @@ class NNIManager implements Manager { ...@@ -204,7 +204,6 @@ class NNIManager implements Manager {
return Promise.resolve(); return Promise.resolve();
} }
const expParams: ExperimentParams = this.experimentProfile.params; const expParams: ExperimentParams = this.experimentProfile.params;
setInitTrialSequenceId(this.experimentProfile.maxSequenceId + 1);
// Set up multiphase config // Set up multiphase config
if (expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) { if (expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) {
...@@ -301,6 +300,37 @@ class NNIManager implements Manager { ...@@ -301,6 +300,37 @@ class NNIManager implements Manager {
return this.dataStore.getMetricData(trialJobId, metricType); return this.dataStore.getMetricData(trialJobId, metricType);
} }
/**
 * Return the metric records of every trial whose sequence ID lies in the
 * inclusive range [minSeqId, maxSeqId].
 *
 * Trials are looked up through the data store, then the full metric list is
 * loaded and narrowed down to the records whose `trialJobId` belongs to one of
 * the selected trials.
 *
 * @param minSeqId lowest trial sequence ID to include
 * @param maxSeqId highest trial sequence ID to include
 * @returns metric records of the selected trials, in data-store order
 */
public async getMetricDataByRange(minSeqId: number, maxSeqId: number): Promise<MetricDataRecord[]> {
    const selectedTrialIds = new Set<string>();
    for (const job of await this.dataStore.listTrialJobs()) {
        const seqId = job.sequenceId;
        // FIXME: can this be undefined?
        if (seqId !== undefined && minSeqId <= seqId && seqId <= maxSeqId) {
            selectedTrialIds.add(job.id);
        }
    }

    const records = await this.dataStore.getMetricData();

    return records.filter(record => selectedTrialIds.has(record.trialJobId));
}
/**
 * Return a reduced view of the stored metrics: every record whose type is not
 * 'PERIODICAL', followed by one 'PERIODICAL' record per trial job (the one
 * with the greatest `sequence`; on ties the later record in data-store order
 * wins).
 *
 * FIXME: unit test
 *
 * @returns non-periodical records first, then the selected periodical records
 */
public async getLatestMetricData(): Promise<MetricDataRecord[]> {
    // FIXME: this can take a long time
    const records: MetricDataRecord[] = await this.dataStore.getMetricData();
    const nonPeriodical: MetricDataRecord[] = [];
    const newestByTrial: Map<string, MetricDataRecord> = new Map<string, MetricDataRecord>();

    records.forEach((record: MetricDataRecord): void => {
        if (record.type === 'PERIODICAL') {
            const known: MetricDataRecord | undefined = newestByTrial.get(record.trialJobId);
            // Replace the remembered record unless it has a strictly larger sequence.
            const supersedes: boolean = known === undefined || known.sequence <= record.sequence;
            if (supersedes) {
                newestByTrial.set(record.trialJobId, record);
            }
        } else {
            nonPeriodical.push(record);
        }
    });

    const newest: MetricDataRecord[] = Array.from(newestByTrial.values());

    return [...nonPeriodical, ...newest];
}
public getExperimentProfile(): Promise<ExperimentProfile> { public getExperimentProfile(): Promise<ExperimentProfile> {
// TO DO: using Promise.resolve() // TO DO: using Promise.resolve()
const deferred: Deferred<ExperimentProfile> = new Deferred<ExperimentProfile>(); const deferred: Deferred<ExperimentProfile> = new Deferred<ExperimentProfile>();
...@@ -456,11 +486,7 @@ class NNIManager implements Manager { ...@@ -456,11 +486,7 @@ class NNIManager implements Manager {
case 'EARLY_STOPPED': case 'EARLY_STOPPED':
this.trialJobs.delete(trialJobId); this.trialJobs.delete(trialJobId);
finishedTrialJobNum++; finishedTrialJobNum++;
if (trialJobDetail.form.jobType === 'TRIAL') { hyperParams = trialJobDetail.form.hyperParameters.value;
hyperParams = (<TrialJobApplicationForm>trialJobDetail.form).hyperParameters.value;
} else {
throw new Error('Error: jobType error, not TRIAL');
}
this.dispatcher.sendCommand(TRIAL_END, JSON.stringify({ this.dispatcher.sendCommand(TRIAL_END, JSON.stringify({
trial_job_id: trialJobDetail.id, trial_job_id: trialJobDetail.id,
event: trialJobDetail.status, event: trialJobDetail.status,
...@@ -473,11 +499,7 @@ class NNIManager implements Manager { ...@@ -473,11 +499,7 @@ class NNIManager implements Manager {
// TO DO: push this job to queue for retry // TO DO: push this job to queue for retry
this.trialJobs.delete(trialJobId); this.trialJobs.delete(trialJobId);
finishedTrialJobNum++; finishedTrialJobNum++;
if (trialJobDetail.form.jobType === 'TRIAL') { hyperParams = trialJobDetail.form.hyperParameters.value;
hyperParams = (<TrialJobApplicationForm>trialJobDetail.form).hyperParameters.value;
} else {
throw new Error('Error: jobType error, not TRIAL');
}
this.dispatcher.sendCommand(TRIAL_END, JSON.stringify({ this.dispatcher.sendCommand(TRIAL_END, JSON.stringify({
trial_job_id: trialJobDetail.id, trial_job_id: trialJobDetail.id,
event: trialJobDetail.status, event: trialJobDetail.status,
...@@ -576,7 +598,7 @@ class NNIManager implements Manager { ...@@ -576,7 +598,7 @@ class NNIManager implements Manager {
} }
this.currSubmittedTrialNum++; this.currSubmittedTrialNum++;
const trialJobAppForm: TrialJobApplicationForm = { const trialJobAppForm: TrialJobApplicationForm = {
jobType: 'TRIAL', sequenceId: this.experimentProfile.nextSequenceId++,
hyperParameters: { hyperParameters: {
value: hyperParams, value: hyperParams,
index: 0 index: 0
...@@ -584,7 +606,7 @@ class NNIManager implements Manager { ...@@ -584,7 +606,7 @@ class NNIManager implements Manager {
}; };
this.log.info(`submitTrialJob: form: ${JSON.stringify(trialJobAppForm)}`); this.log.info(`submitTrialJob: form: ${JSON.stringify(trialJobAppForm)}`);
const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(trialJobAppForm); const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(trialJobAppForm);
await this.storeMaxSequenceId(trialJobDetail.sequenceId); await this.storeExperimentProfile();
this.trialJobs.set(trialJobDetail.id, Object.assign({}, trialJobDetail)); this.trialJobs.set(trialJobDetail.id, Object.assign({}, trialJobDetail));
const trialJobDetailSnapshot: TrialJobDetail | undefined = this.trialJobs.get(trialJobDetail.id); const trialJobDetailSnapshot: TrialJobDetail | undefined = this.trialJobs.get(trialJobDetail.id);
if (trialJobDetailSnapshot != undefined) { if (trialJobDetailSnapshot != undefined) {
...@@ -703,7 +725,7 @@ class NNIManager implements Manager { ...@@ -703,7 +725,7 @@ class NNIManager implements Manager {
assert(tunerCommand.trial_job_id !== undefined); assert(tunerCommand.trial_job_id !== undefined);
const trialJobForm: TrialJobApplicationForm = { const trialJobForm: TrialJobApplicationForm = {
jobType: 'TRIAL', sequenceId: -1, // FIXME: multi-phase tuner should use sequence ID instead of trial job ID
hyperParameters: { hyperParameters: {
value: content, value: content,
index: tunerCommand.parameter_index index: tunerCommand.parameter_index
...@@ -757,7 +779,7 @@ class NNIManager implements Manager { ...@@ -757,7 +779,7 @@ class NNIManager implements Manager {
revision: 0, revision: 0,
execDuration: 0, execDuration: 0,
logDir: getExperimentRootDir(), logDir: getExperimentRootDir(),
maxSequenceId: 0, nextSequenceId: 0,
params: { params: {
authorName: '', authorName: '',
experimentName: '', experimentName: '',
...@@ -788,13 +810,6 @@ class NNIManager implements Manager { ...@@ -788,13 +810,6 @@ class NNIManager implements Manager {
return Promise.resolve(chkpDir); return Promise.resolve(chkpDir);
} }
private async storeMaxSequenceId(sequenceId: number): Promise<void> {
if (sequenceId > this.experimentProfile.maxSequenceId) {
this.experimentProfile.maxSequenceId = sequenceId;
await this.storeExperimentProfile();
}
}
} }
export { NNIManager }; export { NNIManager };
...@@ -54,7 +54,7 @@ create table ExperimentProfile ( ...@@ -54,7 +54,7 @@ create table ExperimentProfile (
startTime integer, startTime integer,
endTime integer, endTime integer,
logDir text, logDir text,
maxSequenceId integer, nextSequenceId integer,
revision integer); revision integer);
create index ExperimentProfile_id on ExperimentProfile(id); create index ExperimentProfile_id on ExperimentProfile(id);
`; `;
...@@ -67,7 +67,7 @@ function loadExperimentProfile(row: any): ExperimentProfile { ...@@ -67,7 +67,7 @@ function loadExperimentProfile(row: any): ExperimentProfile {
startTime: row.startTime === null ? undefined : row.startTime, startTime: row.startTime === null ? undefined : row.startTime,
endTime: row.endTime === null ? undefined : row.endTime, endTime: row.endTime === null ? undefined : row.endTime,
logDir: row.logDir === null ? undefined : row.logDir, logDir: row.logDir === null ? undefined : row.logDir,
maxSequenceId: row.maxSequenceId, nextSequenceId: row.nextSequenceId,
revision: row.revision revision: row.revision
}; };
} }
...@@ -144,7 +144,7 @@ class SqlDB implements Database { ...@@ -144,7 +144,7 @@ class SqlDB implements Database {
exp.startTime === undefined ? null : exp.startTime, exp.startTime === undefined ? null : exp.startTime,
exp.endTime === undefined ? null : exp.endTime, exp.endTime === undefined ? null : exp.endTime,
exp.logDir === undefined ? null : exp.logDir, exp.logDir === undefined ? null : exp.logDir,
exp.maxSequenceId, exp.nextSequenceId,
exp.revision exp.revision
]; ];
this.log.trace(`storeExperimentProfile: SQL: ${sql}, args: ${JSON.stringify(args)}`); this.log.trace(`storeExperimentProfile: SQL: ${sql}, args: ${JSON.stringify(args)}`);
...@@ -183,7 +183,7 @@ class SqlDB implements Database { ...@@ -183,7 +183,7 @@ class SqlDB implements Database {
event: TrialJobEvent, trialJobId: string, timestamp: number, hyperParameter?: string, jobDetail?: TrialJobDetail): Promise<void> { event: TrialJobEvent, trialJobId: string, timestamp: number, hyperParameter?: string, jobDetail?: TrialJobDetail): Promise<void> {
const sql: string = 'insert into TrialJobEvent values (?,?,?,?,?,?)'; const sql: string = 'insert into TrialJobEvent values (?,?,?,?,?,?)';
const logPath: string | undefined = jobDetail === undefined ? undefined : jobDetail.url; const logPath: string | undefined = jobDetail === undefined ? undefined : jobDetail.url;
const sequenceId: number | undefined = jobDetail === undefined ? undefined : jobDetail.sequenceId; const sequenceId: number | undefined = jobDetail === undefined ? undefined : jobDetail.form.sequenceId;
const args: any[] = [timestamp, trialJobId, event, hyperParameter, logPath, sequenceId]; const args: any[] = [timestamp, trialJobId, event, hyperParameter, logPath, sequenceId];
this.log.trace(`storeTrialJobEvent: SQL: ${sql}, args: ${JSON.stringify(args)}`); this.log.trace(`storeTrialJobEvent: SQL: ${sql}, args: ${JSON.stringify(args)}`);
......
...@@ -80,7 +80,7 @@ describe('Unit test for dataStore', () => { ...@@ -80,7 +80,7 @@ describe('Unit test for dataStore', () => {
execDuration: 0, execDuration: 0,
startTime: Date.now(), startTime: Date.now(),
endTime: Date.now(), endTime: Date.now(),
maxSequenceId: 0, nextSequenceId: 0,
revision: 0 revision: 0
} }
const id: string = profile.id; const id: string = profile.id;
......
...@@ -41,9 +41,9 @@ class MockedTrainingService extends TrainingService { ...@@ -41,9 +41,9 @@ class MockedTrainingService extends TrainingService {
url: 'http://test', url: 'http://test',
workingDirectory: '/tmp/mocked', workingDirectory: '/tmp/mocked',
form: { form: {
jobType: 'TRIAL' sequenceId: 0,
hyperParameters: { value: '', index: 0 }
}, },
sequenceId: 0
}; };
public jobDetail2: TrialJobDetail = { public jobDetail2: TrialJobDetail = {
id: '3456', id: '3456',
...@@ -55,9 +55,9 @@ class MockedTrainingService extends TrainingService { ...@@ -55,9 +55,9 @@ class MockedTrainingService extends TrainingService {
url: 'http://test', url: 'http://test',
workingDirectory: '/tmp/mocked', workingDirectory: '/tmp/mocked',
form: { form: {
jobType: 'TRIAL' sequenceId: 1,
hyperParameters: { value: '', index: 1 }
}, },
sequenceId: 0
}; };
public listTrialJobs(): Promise<TrialJobDetail[]> { public listTrialJobs(): Promise<TrialJobDetail[]> {
......
...@@ -101,7 +101,7 @@ describe('Unit test for nnimanager', function () { ...@@ -101,7 +101,7 @@ describe('Unit test for nnimanager', function () {
params: updateExperimentParams, params: updateExperimentParams,
id: 'test', id: 'test',
execDuration: 0, execDuration: 0,
maxSequenceId: 0, nextSequenceId: 0,
revision: 0 revision: 0
} }
......
...@@ -64,10 +64,10 @@ const expParams2: ExperimentParams = { ...@@ -64,10 +64,10 @@ const expParams2: ExperimentParams = {
}; };
const profiles: ExperimentProfile[] = [ const profiles: ExperimentProfile[] = [
{ params: expParams1, id: '#1', execDuration: 0, logDir: '/log', startTime: Date.now(), endTime: undefined, maxSequenceId: 0, revision: 1,}, { params: expParams1, id: '#1', execDuration: 0, logDir: '/log', startTime: Date.now(), endTime: undefined, nextSequenceId: 0, revision: 1,},
{ params: expParams1, id: '#1', execDuration: 0, logDir: '/log', startTime: Date.now(), endTime: Date.now(), maxSequenceId: 0, revision: 2 }, { params: expParams1, id: '#1', execDuration: 0, logDir: '/log', startTime: Date.now(), endTime: Date.now(), nextSequenceId: 1, revision: 2 },
{ params: expParams2, id: '#2', execDuration: 0, logDir: '/log', startTime: Date.now(), endTime: Date.now(), maxSequenceId: 0, revision: 2 }, { params: expParams2, id: '#2', execDuration: 0, logDir: '/log', startTime: Date.now(), endTime: Date.now(), nextSequenceId: 0, revision: 2 },
{ params: expParams2, id: '#2', execDuration: 0, logDir: '/log', startTime: Date.now(), endTime: Date.now(), maxSequenceId: 0, revision: 3 } { params: expParams2, id: '#2', execDuration: 0, logDir: '/log', startTime: Date.now(), endTime: Date.now(), nextSequenceId: 2, revision: 3 }
]; ];
const events: TrialJobEventRecord[] = [ const events: TrialJobEventRecord[] = [
......
...@@ -72,6 +72,8 @@ class NNIRestHandler { ...@@ -72,6 +72,8 @@ class NNIRestHandler {
this.addTrialJob(router); this.addTrialJob(router);
this.cancelTrialJob(router); this.cancelTrialJob(router);
this.getMetricData(router); this.getMetricData(router);
this.getMetricDataByRange(router);
this.getLatestMetricData(router);
this.exportData(router); this.exportData(router);
// Express-joi-validator configuration // Express-joi-validator configuration
...@@ -262,6 +264,28 @@ class NNIRestHandler { ...@@ -262,6 +264,28 @@ class NNIRestHandler {
}); });
} }
/**
 * Register `GET /metric-data-range/:min_seq_id/:max_seq_id`.
 *
 * Both path parameters are converted with `Number()` and passed to
 * `nniManager.getMetricDataByRange`; the resulting records are sent as the
 * response body.
 *
 * Every failure is routed to `handle_error`: a rejected promise, a synchronous
 * throw from the manager, or a path parameter that does not parse to a number.
 * Without the explicit check, a NaN bound would silently yield an empty result.
 *
 * @param router router on which the route is registered
 */
private getMetricDataByRange(router: Router): void {
    router.get('/metric-data-range/:min_seq_id/:max_seq_id', async (req: Request, res: Response) => {
        try {
            const minSeqId = Number(req.params.min_seq_id);
            const maxSeqId = Number(req.params.max_seq_id);
            // Only NaN is rejected; Infinity stays usable as an open-ended bound.
            if (Number.isNaN(minSeqId) || Number.isNaN(maxSeqId)) {
                throw new Error(
                    `Invalid sequence ID range: ${req.params.min_seq_id} - ${req.params.max_seq_id}`
                );
            }
            // Awaiting inside try also catches synchronous throws, which a
            // `.then().catch()` chain started by the call itself would miss.
            const metricsData: MetricDataRecord[] = await this.nniManager.getMetricDataByRange(minSeqId, maxSeqId);
            res.send(metricsData);
        } catch (err) {
            this.handle_error(err instanceof Error ? err : new Error(String(err)), res);
        }
    });
}
/**
 * Register `GET /metric-data-latest/`.
 *
 * Sends the records returned by `nniManager.getLatestMetricData` as the
 * response body. Any failure, whether a rejected promise or a synchronous
 * throw from the manager, is routed to `handle_error` so the request always
 * receives a response.
 *
 * @param router router on which the route is registered
 */
private getLatestMetricData(router: Router): void {
    router.get('/metric-data-latest/', async (req: Request, res: Response) => {
        try {
            // Awaiting inside try also catches synchronous throws, which a
            // `.then().catch()` chain started by the call itself would miss.
            const metricsData: MetricDataRecord[] = await this.nniManager.getLatestMetricData();
            res.send(metricsData);
        } catch (err) {
            this.handle_error(err instanceof Error ? err : new Error(String(err)), res);
        }
    });
}
private exportData(router: Router): void { private exportData(router: Router): void {
router.get('/export-data', (req: Request, res: Response) => { router.get('/export-data', (req: Request, res: Response) => {
this.nniManager.exportData().then((exportedData: string) => { this.nniManager.exportData().then((exportedData: string) => {
......
...@@ -209,7 +209,7 @@ export namespace ValidationSchemas { ...@@ -209,7 +209,7 @@ export namespace ValidationSchemas {
startTime: joi.number(), startTime: joi.number(),
endTime: joi.number(), endTime: joi.number(),
logDir: joi.string(), logDir: joi.string(),
maxSequenceId: joi.number() nextSequenceId: joi.number()
} }
}; };
} }
...@@ -85,9 +85,9 @@ export class MockedNNIManager extends Manager { ...@@ -85,9 +85,9 @@ export class MockedNNIManager extends Manager {
// tslint:disable-next-line:no-http-string // tslint:disable-next-line:no-http-string
url: 'http://test', url: 'http://test',
workingDirectory: '/tmp/mocked', workingDirectory: '/tmp/mocked',
sequenceId: 0,
form: { form: {
jobType: 'TRIAL' sequenceId: 0,
hyperParameters: { value: '', index: 0 }
} }
}; };
deferred.resolve(jobDetail); deferred.resolve(jobDetail);
...@@ -129,6 +129,12 @@ export class MockedNNIManager extends Manager { ...@@ -129,6 +129,12 @@ export class MockedNNIManager extends Manager {
public getMetricData(trialJobId: string, metricType: MetricType): Promise<MetricDataRecord[]> { public getMetricData(trialJobId: string, metricType: MetricType): Promise<MetricDataRecord[]> {
throw new MethodNotImplementedError(); throw new MethodNotImplementedError();
} }
/**
 * Not supported by this mock.
 * Always throws MethodNotImplementedError synchronously; no promise is returned.
 */
public getMetricDataByRange(minSeqId: number, maxSeqId: number): Promise<MetricDataRecord[]> {
throw new MethodNotImplementedError();
}
/**
 * Not supported by this mock.
 * Always throws MethodNotImplementedError synchronously; no promise is returned.
 */
public getLatestMetricData(): Promise<MetricDataRecord[]> {
throw new MethodNotImplementedError();
}
public getExperimentProfile(): Promise<ExperimentProfile> { public getExperimentProfile(): Promise<ExperimentProfile> {
const profile: ExperimentProfile = { const profile: ExperimentProfile = {
params: { params: {
...@@ -148,7 +154,7 @@ export class MockedNNIManager extends Manager { ...@@ -148,7 +154,7 @@ export class MockedNNIManager extends Manager {
execDuration: 0, execDuration: 0,
startTime: Date.now(), startTime: Date.now(),
endTime: Date.now(), endTime: Date.now(),
maxSequenceId: 0, nextSequenceId: 0,
revision: 0 revision: 0
}; };
......
...@@ -25,7 +25,7 @@ import * as path from 'path'; ...@@ -25,7 +25,7 @@ import * as path from 'path';
import * as component from '../../../common/component'; import * as component from '../../../common/component';
import { getExperimentId } from '../../../common/experimentStartupInfo'; import { getExperimentId } from '../../../common/experimentStartupInfo';
import { import {
JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail, TrialJobStatus NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail, TrialJobStatus
} from '../../../common/trainingService'; } from '../../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils'; import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
...@@ -55,7 +55,6 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -55,7 +55,6 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
super(); super();
this.fcJobInfoCollector = new FrameworkControllerJobInfoCollector(this.trialJobsMap); this.fcJobInfoCollector = new FrameworkControllerJobInfoCollector(this.trialJobsMap);
this.experimentId = getExperimentId(); this.experimentId = getExperimentId();
this.nextTrialSequenceId = -1;
} }
public async run(): Promise<void> { public async run(): Promise<void> {
...@@ -77,7 +76,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -77,7 +76,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
} }
} }
public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> { public async submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
if (this.fcClusterConfig === undefined) { if (this.fcClusterConfig === undefined) {
throw new Error('frameworkcontrollerClusterConfig is not initialized'); throw new Error('frameworkcontrollerClusterConfig is not initialized');
} }
...@@ -91,14 +90,13 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -91,14 +90,13 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
} }
const trialJobId: string = uniqueString(5); const trialJobId: string = uniqueString(5);
const curTrialSequenceId: number = this.generateSequenceId();
// Set trial's NFS working folder // Set trial's NFS working folder
const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId); const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId);
const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId); const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
const frameworkcontrollerJobName: string = `nniexp${this.experimentId}trial${trialJobId}`.toLowerCase(); const frameworkcontrollerJobName: string = `nniexp${this.experimentId}trial${trialJobId}`.toLowerCase();
//Generate the port used for taskRole //Generate the port used for taskRole
this.generateContainerPort(); this.generateContainerPort();
await this.prepareRunScript(trialLocalTempFolder, curTrialSequenceId, trialJobId, trialWorkingFolder, form); await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, form);
//upload code files //upload code files
const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder);
...@@ -113,7 +111,6 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -113,7 +111,6 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
trialWorkingFolder, trialWorkingFolder,
form, form,
frameworkcontrollerJobName, frameworkcontrollerJobName,
curTrialSequenceId,
trialJobOutputUrl trialJobOutputUrl
); );
...@@ -248,8 +245,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -248,8 +245,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
return `${portScript} . /mnt/frameworkbarrier/injector.sh && ${command}`; return `${portScript} . /mnt/frameworkbarrier/injector.sh && ${command}`;
} }
private async prepareRunScript(trialLocalTempFolder: string, curTrialSequenceId: number, trialJobId: string, private async prepareRunScript(trialLocalTempFolder: string, trialJobId: string,
trialWorkingFolder: string, form: JobApplicationForm): Promise<void> { trialWorkingFolder: string, form: TrialJobApplicationForm): Promise<void> {
if (this.fcTrialConfig === undefined) { if (this.fcTrialConfig === undefined) {
throw new Error('frameworkcontroller trial config is not initialized'); throw new Error('frameworkcontroller trial config is not initialized');
} }
...@@ -264,16 +261,16 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -264,16 +261,16 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
for (const taskRole of this.fcTrialConfig.taskRoles) { for (const taskRole of this.fcTrialConfig.taskRoles) {
const runScriptContent: string = const runScriptContent: string =
await this.generateRunScript('frameworkcontroller', trialJobId, trialWorkingFolder, await this.generateRunScript('frameworkcontroller', trialJobId, trialWorkingFolder,
this.generateCommandScript(taskRole.command), curTrialSequenceId.toString(), this.generateCommandScript(taskRole.command), form.sequenceId.toString(),
taskRole.name, taskRole.gpuNum); taskRole.name, taskRole.gpuNum);
await fs.promises.writeFile(path.join(trialLocalTempFolder, `run_${taskRole.name}.sh`), runScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, `run_${taskRole.name}.sh`), runScriptContent, { encoding: 'utf8' });
} }
// Write file content ( parameter.cfg ) to local tmp folders // Write file content ( parameter.cfg ) to local tmp folders
const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form); const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form);
if (trialForm !== undefined && trialForm.hyperParameters !== undefined) { if (form !== undefined) {
await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)), await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(form.hyperParameters)),
trialForm.hyperParameters.value, { encoding: 'utf8' }); form.hyperParameters.value, { encoding: 'utf8' });
} }
} }
......
...@@ -27,7 +27,7 @@ import * as component from '../../../common/component'; ...@@ -27,7 +27,7 @@ import * as component from '../../../common/component';
import { getExperimentId } from '../../../common/experimentStartupInfo'; import { getExperimentId } from '../../../common/experimentStartupInfo';
import { import {
JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail, TrialJobStatus NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail, TrialJobStatus
} from '../../../common/trainingService'; } from '../../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils'; import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
...@@ -59,7 +59,6 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -59,7 +59,6 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
super(); super();
this.kubeflowJobInfoCollector = new KubeflowJobInfoCollector(this.trialJobsMap); this.kubeflowJobInfoCollector = new KubeflowJobInfoCollector(this.trialJobsMap);
this.experimentId = getExperimentId(); this.experimentId = getExperimentId();
this.nextTrialSequenceId = -1;
this.log.info('Construct Kubeflow training service.'); this.log.info('Construct Kubeflow training service.');
} }
...@@ -84,7 +83,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -84,7 +83,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
this.log.info('Kubeflow training service exit.'); this.log.info('Kubeflow training service exit.');
} }
public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> { public async submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
if (this.kubernetesCRDClient === undefined) { if (this.kubernetesCRDClient === undefined) {
throw new Error('Kubeflow job operator client is undefined'); throw new Error('Kubeflow job operator client is undefined');
} }
...@@ -96,10 +95,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -96,10 +95,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
const trialJobId: string = uniqueString(5); const trialJobId: string = uniqueString(5);
const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId); const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId);
const kubeflowJobName: string = `nni-exp-${this.experimentId}-trial-${trialJobId}`.toLowerCase(); const kubeflowJobName: string = `nni-exp-${this.experimentId}-trial-${trialJobId}`.toLowerCase();
const curTrialSequenceId: number = this.generateSequenceId();
const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId); const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
//prepare the runscript //prepare the runscript
await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, curTrialSequenceId, form); await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, form);
//upload files to sotrage //upload files to sotrage
const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder);
let initStatus: TrialJobStatus = 'WAITING'; let initStatus: TrialJobStatus = 'WAITING';
...@@ -113,7 +111,6 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -113,7 +111,6 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
trialWorkingFolder, trialWorkingFolder,
form, form,
kubeflowJobName, kubeflowJobName,
curTrialSequenceId,
trialJobOutputUrl trialJobOutputUrl
); );
...@@ -236,8 +233,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -236,8 +233,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
return Promise.resolve(trialJobOutputUrl); return Promise.resolve(trialJobOutputUrl);
} }
private async prepareRunScript(trialLocalTempFolder: string, trialJobId: string, trialWorkingFolder: string, curTrialSequenceId: number, private async prepareRunScript(trialLocalTempFolder: string, trialJobId: string, trialWorkingFolder: string,
form: JobApplicationForm): Promise<void> { form: TrialJobApplicationForm): Promise<void> {
if (this.kubeflowClusterConfig === undefined) { if (this.kubeflowClusterConfig === undefined) {
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
} }
...@@ -262,7 +259,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -262,7 +259,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if (kubeflowTrialConfig.worker !== undefined) { if (kubeflowTrialConfig.worker !== undefined) {
const workerRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder, const workerRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder,
kubeflowTrialConfig.worker.command, kubeflowTrialConfig.worker.command,
curTrialSequenceId.toString(), 'worker', form.sequenceId.toString(), 'worker',
kubeflowTrialConfig.worker.gpuNum); kubeflowTrialConfig.worker.gpuNum);
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_worker.sh'), workerRunScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_worker.sh'), workerRunScriptContent, { encoding: 'utf8' });
} }
...@@ -272,7 +269,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -272,7 +269,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if (tensorflowTrialConfig.ps !== undefined) { if (tensorflowTrialConfig.ps !== undefined) {
const psRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder, const psRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder,
tensorflowTrialConfig.ps.command, tensorflowTrialConfig.ps.command,
curTrialSequenceId.toString(), form.sequenceId.toString(),
'ps', tensorflowTrialConfig.ps.gpuNum); 'ps', tensorflowTrialConfig.ps.gpuNum);
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_ps.sh'), psRunScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_ps.sh'), psRunScriptContent, { encoding: 'utf8' });
} }
...@@ -281,16 +278,15 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -281,16 +278,15 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if (pytorchTrialConfig.master !== undefined) { if (pytorchTrialConfig.master !== undefined) {
const masterRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder, const masterRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder,
pytorchTrialConfig.master.command, pytorchTrialConfig.master.command,
curTrialSequenceId.toString(), 'master', form.sequenceId.toString(), 'master',
pytorchTrialConfig.master.gpuNum); pytorchTrialConfig.master.gpuNum);
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_master.sh'), masterRunScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_master.sh'), masterRunScriptContent, { encoding: 'utf8' });
} }
} }
// Write file content ( parameter.cfg ) to local tmp folders // Write file content ( parameter.cfg ) to local tmp folders
const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form); if (form !== undefined) {
if (trialForm !== undefined && trialForm.hyperParameters !== undefined) { await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(form.hyperParameters)),
await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)), form.hyperParameters.value, { encoding: 'utf8' });
trialForm.hyperParameters.value, { encoding: 'utf8' });
} }
} }
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
'use strict'; 'use strict';
import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
/** /**
* KubeflowTrialJobDetail * KubeflowTrialJobDetail
...@@ -33,21 +33,19 @@ export class KubernetesTrialJobDetail implements TrialJobDetail { ...@@ -33,21 +33,19 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
public tags?: string[]; public tags?: string[];
public url?: string; public url?: string;
public workingDirectory: string; public workingDirectory: string;
public form: JobApplicationForm; public form: TrialJobApplicationForm;
public kubernetesJobName: string; public kubernetesJobName: string;
public sequenceId: number;
public queryJobFailedCount: number; public queryJobFailedCount: number;
constructor(id: string, status: TrialJobStatus, submitTime: number, constructor(id: string, status: TrialJobStatus, submitTime: number,
workingDirectory: string, form: JobApplicationForm, workingDirectory: string, form: TrialJobApplicationForm,
kubernetesJobName: string, sequenceId: number, url: string) { kubernetesJobName: string, url: string) {
this.id = id; this.id = id;
this.status = status; this.status = status;
this.submitTime = submitTime; this.submitTime = submitTime;
this.workingDirectory = workingDirectory; this.workingDirectory = workingDirectory;
this.form = form; this.form = form;
this.kubernetesJobName = kubernetesJobName; this.kubernetesJobName = kubernetesJobName;
this.sequenceId = sequenceId;
this.tags = []; this.tags = [];
this.queryJobFailedCount = 0; this.queryJobFailedCount = 0;
this.url = url; this.url = url;
......
...@@ -26,7 +26,7 @@ import * as azureStorage from 'azure-storage'; ...@@ -26,7 +26,7 @@ import * as azureStorage from 'azure-storage';
import { EventEmitter } from 'events'; import { EventEmitter } from 'events';
import { Base64 } from 'js-base64'; import { Base64 } from 'js-base64';
import { String } from 'typescript-string-operations'; import { String } from 'typescript-string-operations';
import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo'; import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { import {
NNIManagerIpConfig, TrialJobDetail, TrialJobMetric NNIManagerIpConfig, TrialJobDetail, TrialJobMetric
...@@ -53,7 +53,6 @@ abstract class KubernetesTrainingService { ...@@ -53,7 +53,6 @@ abstract class KubernetesTrainingService {
protected readonly trialLocalNFSTempFolder: string; protected readonly trialLocalNFSTempFolder: string;
protected stopping: boolean = false; protected stopping: boolean = false;
protected experimentId! : string; protected experimentId! : string;
protected nextTrialSequenceId: number;
protected kubernetesRestServerPort?: number; protected kubernetesRestServerPort?: number;
protected readonly CONTAINER_MOUNT_PATH: string; protected readonly CONTAINER_MOUNT_PATH: string;
protected azureStorageClient?: azureStorage.FileService; protected azureStorageClient?: azureStorage.FileService;
...@@ -74,7 +73,6 @@ abstract class KubernetesTrainingService { ...@@ -74,7 +73,6 @@ abstract class KubernetesTrainingService {
this.trialJobsMap = new Map<string, KubernetesTrialJobDetail>(); this.trialJobsMap = new Map<string, KubernetesTrialJobDetail>();
this.trialLocalNFSTempFolder = path.join(getExperimentRootDir(), 'trials-nfs-tmp'); this.trialLocalNFSTempFolder = path.join(getExperimentRootDir(), 'trials-nfs-tmp');
this.experimentId = getExperimentId(); this.experimentId = getExperimentId();
this.nextTrialSequenceId = -1;
this.CONTAINER_MOUNT_PATH = '/tmp/mount'; this.CONTAINER_MOUNT_PATH = '/tmp/mount';
this.genericK8sClient = new GeneralK8sClient(); this.genericK8sClient = new GeneralK8sClient();
this.logCollection = 'none'; this.logCollection = 'none';
...@@ -93,10 +91,8 @@ abstract class KubernetesTrainingService { ...@@ -93,10 +91,8 @@ abstract class KubernetesTrainingService {
const jobs: TrialJobDetail[] = []; const jobs: TrialJobDetail[] = [];
for (const [key, value] of this.trialJobsMap) { for (const [key, value] of this.trialJobsMap) {
if (value.form.jobType === 'TRIAL') {
jobs.push(await this.getTrialJob(key)); jobs.push(await this.getTrialJob(key));
} }
}
return Promise.resolve(jobs); return Promise.resolve(jobs);
} }
...@@ -222,14 +218,6 @@ abstract class KubernetesTrainingService { ...@@ -222,14 +218,6 @@ abstract class KubernetesTrainingService {
return Promise.resolve(); return Promise.resolve();
} }
protected generateSequenceId(): number {
if (this.nextTrialSequenceId === -1) {
this.nextTrialSequenceId = getInitTrialSequenceId();
}
return this.nextTrialSequenceId++;
}
// tslint:disable: no-unsafe-any no-any // tslint:disable: no-unsafe-any no-any
protected async createAzureStorage(vaultName: string, valutKeyName: string, accountName: string, azureShare: string): Promise<void> { protected async createAzureStorage(vaultName: string, valutKeyName: string, accountName: string, azureShare: string): Promise<void> {
try { try {
......
...@@ -26,10 +26,10 @@ import * as path from 'path'; ...@@ -26,10 +26,10 @@ import * as path from 'path';
import * as ts from 'tail-stream'; import * as ts from 'tail-stream';
import * as tkill from 'tree-kill'; import * as tkill from 'tree-kill';
import { NNIError, NNIErrorNames } from '../../common/errors'; import { NNIError, NNIErrorNames } from '../../common/errors';
import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo'; import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { import {
HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService, TrialJobApplicationForm, HyperParameters, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric, TrialJobStatus TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../../common/trainingService'; } from '../../common/trainingService';
import { import {
...@@ -76,21 +76,19 @@ class LocalTrialJobDetail implements TrialJobDetail { ...@@ -76,21 +76,19 @@ class LocalTrialJobDetail implements TrialJobDetail {
public tags?: string[]; public tags?: string[];
public url?: string; public url?: string;
public workingDirectory: string; public workingDirectory: string;
public form: JobApplicationForm; public form: TrialJobApplicationForm;
public sequenceId: number;
public pid?: number; public pid?: number;
public gpuIndices?: number[]; public gpuIndices?: number[];
constructor( constructor(
id: string, status: TrialJobStatus, submitTime: number, id: string, status: TrialJobStatus, submitTime: number,
workingDirectory: string, form: JobApplicationForm, sequenceId: number) { workingDirectory: string, form: TrialJobApplicationForm) {
this.id = id; this.id = id;
this.status = status; this.status = status;
this.submitTime = submitTime; this.submitTime = submitTime;
this.workingDirectory = workingDirectory; this.workingDirectory = workingDirectory;
this.form = form; this.form = form;
this.url = `file://localhost:${workingDirectory}`; this.url = `file://localhost:${workingDirectory}`;
this.sequenceId = sequenceId;
this.gpuIndices = []; this.gpuIndices = [];
} }
} }
...@@ -125,7 +123,6 @@ class LocalTrainingService implements TrainingService { ...@@ -125,7 +123,6 @@ class LocalTrainingService implements TrainingService {
private initialized: boolean; private initialized: boolean;
private stopping: boolean; private stopping: boolean;
private rootDir!: string; private rootDir!: string;
private trialSequenceId: number;
private readonly experimentId! : string; private readonly experimentId! : string;
private gpuScheduler!: GPUScheduler; private gpuScheduler!: GPUScheduler;
private readonly occupiedGpuIndexNumMap: Map<number, number>; private readonly occupiedGpuIndexNumMap: Map<number, number>;
...@@ -145,7 +142,6 @@ class LocalTrainingService implements TrainingService { ...@@ -145,7 +142,6 @@ class LocalTrainingService implements TrainingService {
this.initialized = false; this.initialized = false;
this.stopping = false; this.stopping = false;
this.log = getLogger(); this.log = getLogger();
this.trialSequenceId = -1;
this.experimentId = getExperimentId(); this.experimentId = getExperimentId();
this.jobStreamMap = new Map<string, ts.Stream>(); this.jobStreamMap = new Map<string, ts.Stream>();
this.log.info('Construct local machine training service.'); this.log.info('Construct local machine training service.');
...@@ -169,10 +165,8 @@ class LocalTrainingService implements TrainingService { ...@@ -169,10 +165,8 @@ class LocalTrainingService implements TrainingService {
const jobs: TrialJobDetail[] = []; const jobs: TrialJobDetail[] = [];
for (const key of this.jobMap.keys()) { for (const key of this.jobMap.keys()) {
const trialJob: TrialJobDetail = await this.getTrialJob(key); const trialJob: TrialJobDetail = await this.getTrialJob(key);
if (trialJob.form.jobType === 'TRIAL') {
jobs.push(trialJob); jobs.push(trialJob);
} }
}
return jobs; return jobs;
} }
...@@ -182,9 +176,6 @@ class LocalTrainingService implements TrainingService { ...@@ -182,9 +176,6 @@ class LocalTrainingService implements TrainingService {
if (trialJob === undefined) { if (trialJob === undefined) {
throw new NNIError(NNIErrorNames.NOT_FOUND, 'Trial job not found'); throw new NNIError(NNIErrorNames.NOT_FOUND, 'Trial job not found');
} }
if (trialJob.form.jobType === 'HOST') {
return this.getHostJob(trialJobId);
}
if (trialJob.status === 'RUNNING') { if (trialJob.status === 'RUNNING') {
const alive: boolean = await isAlive(trialJob.pid); const alive: boolean = await isAlive(trialJob.pid);
if (!alive) { if (!alive) {
...@@ -219,18 +210,14 @@ class LocalTrainingService implements TrainingService { ...@@ -219,18 +210,14 @@ class LocalTrainingService implements TrainingService {
this.eventEmitter.off('metric', listener); this.eventEmitter.off('metric', listener);
} }
public submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> { public submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
if (form.jobType === 'HOST') {
return this.runHostJob(<HostJobApplicationForm>form);
} else if (form.jobType === 'TRIAL') {
const trialJobId: string = uniqueString(5); const trialJobId: string = uniqueString(5);
const trialJobDetail: LocalTrialJobDetail = new LocalTrialJobDetail( const trialJobDetail: LocalTrialJobDetail = new LocalTrialJobDetail(
trialJobId, trialJobId,
'WAITING', 'WAITING',
Date.now(), Date.now(),
path.join(this.rootDir, 'trials', trialJobId), path.join(this.rootDir, 'trials', trialJobId),
form, form
this.generateSequenceId()
); );
this.jobQueue.push(trialJobId); this.jobQueue.push(trialJobId);
this.jobMap.set(trialJobId, trialJobDetail); this.jobMap.set(trialJobId, trialJobDetail);
...@@ -238,9 +225,6 @@ class LocalTrainingService implements TrainingService { ...@@ -238,9 +225,6 @@ class LocalTrainingService implements TrainingService {
this.log.debug(`submitTrialJob: return: ${JSON.stringify(trialJobDetail)} `); this.log.debug(`submitTrialJob: return: ${JSON.stringify(trialJobDetail)} `);
return Promise.resolve(trialJobDetail); return Promise.resolve(trialJobDetail);
} else {
return Promise.reject(new Error(`Job form not supported: ${JSON.stringify(form)}`));
}
} }
/** /**
...@@ -248,16 +232,12 @@ class LocalTrainingService implements TrainingService { ...@@ -248,16 +232,12 @@ class LocalTrainingService implements TrainingService {
* @param trialJobId trial job id * @param trialJobId trial job id
* @param form job application form * @param form job application form
*/ */
public async updateTrialJob(trialJobId: string, form: JobApplicationForm): Promise<TrialJobDetail> { public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise<TrialJobDetail> {
const trialJobDetail: undefined | TrialJobDetail = this.jobMap.get(trialJobId); const trialJobDetail: undefined | TrialJobDetail = this.jobMap.get(trialJobId);
if (trialJobDetail === undefined) { if (trialJobDetail === undefined) {
throw new Error(`updateTrialJob failed: ${trialJobId} not found`); throw new Error(`updateTrialJob failed: ${trialJobId} not found`);
} }
if (form.jobType === 'TRIAL') { await this.writeParameterFile(trialJobDetail.workingDirectory, form.hyperParameters);
await this.writeParameterFile(trialJobDetail.workingDirectory, (<TrialJobApplicationForm>form).hyperParameters);
} else {
throw new Error(`updateTrialJob failed: jobType ${form.jobType} not supported.`);
}
return trialJobDetail; return trialJobDetail;
} }
...@@ -279,13 +259,7 @@ class LocalTrainingService implements TrainingService { ...@@ -279,13 +259,7 @@ class LocalTrainingService implements TrainingService {
return Promise.resolve(); return Promise.resolve();
} }
if (trialJob.form.jobType === 'TRIAL') {
tkill(trialJob.pid, 'SIGKILL'); tkill(trialJob.pid, 'SIGKILL');
} else if (trialJob.form.jobType === 'HOST') {
await cpp.exec(`pkill -9 -P ${trialJob.pid}`);
} else {
throw new Error(`Job type not supported: ${trialJob.form.jobType}`);
}
this.setTrialJobStatus(trialJob, getJobCancelStatus(isEarlyStopped)); this.setTrialJobStatus(trialJob, getJobCancelStatus(isEarlyStopped));
return Promise.resolve(); return Promise.resolve();
...@@ -409,7 +383,7 @@ class LocalTrainingService implements TrainingService { ...@@ -409,7 +383,7 @@ class LocalTrainingService implements TrainingService {
{ key: 'NNI_SYS_DIR', value: trialJobDetail.workingDirectory }, { key: 'NNI_SYS_DIR', value: trialJobDetail.workingDirectory },
{ key: 'NNI_TRIAL_JOB_ID', value: trialJobDetail.id }, { key: 'NNI_TRIAL_JOB_ID', value: trialJobDetail.id },
{ key: 'NNI_OUTPUT_DIR', value: trialJobDetail.workingDirectory }, { key: 'NNI_OUTPUT_DIR', value: trialJobDetail.workingDirectory },
{ key: 'NNI_TRIAL_SEQ_ID', value: trialJobDetail.sequenceId.toString() }, { key: 'NNI_TRIAL_SEQ_ID', value: trialJobDetail.form.sequenceId.toString() },
{ key: 'MULTI_PHASE', value: this.isMultiPhase.toString() } { key: 'MULTI_PHASE', value: this.isMultiPhase.toString() }
]; ];
if (gpuNum !== undefined) { if (gpuNum !== undefined) {
...@@ -562,7 +536,7 @@ class LocalTrainingService implements TrainingService { ...@@ -562,7 +536,7 @@ class LocalTrainingService implements TrainingService {
const scriptName: string = getScriptName('run'); const scriptName: string = getScriptName('run');
await fs.promises.writeFile(path.join(trialJobDetail.workingDirectory, scriptName), await fs.promises.writeFile(path.join(trialJobDetail.workingDirectory, scriptName),
runScriptContent.join(getNewLine()), { encoding: 'utf8', mode: 0o777 }); runScriptContent.join(getNewLine()), { encoding: 'utf8', mode: 0o777 });
await this.writeParameterFile(trialJobDetail.workingDirectory, (<TrialJobApplicationForm>trialJobDetail.form).hyperParameters); await this.writeParameterFile(trialJobDetail.workingDirectory, trialJobDetail.form.hyperParameters);
const trialJobProcess: cp.ChildProcess = runScript(path.join(trialJobDetail.workingDirectory, scriptName)); const trialJobProcess: cp.ChildProcess = runScript(path.join(trialJobDetail.workingDirectory, scriptName));
this.setTrialJobStatus(trialJobDetail, 'RUNNING'); this.setTrialJobStatus(trialJobDetail, 'RUNNING');
trialJobDetail.startTime = Date.now(); trialJobDetail.startTime = Date.now();
...@@ -589,60 +563,10 @@ class LocalTrainingService implements TrainingService { ...@@ -589,60 +563,10 @@ class LocalTrainingService implements TrainingService {
this.jobStreamMap.set(trialJobDetail.id, stream); this.jobStreamMap.set(trialJobDetail.id, stream);
} }
private async runHostJob(form: HostJobApplicationForm): Promise<TrialJobDetail> {
const jobId: string = uniqueString(5);
const workDir: string = path.join(this.rootDir, 'hostjobs', jobId);
await cpp.exec(`mkdir -p ${workDir}`);
const wrappedCmd: string = `cd ${workDir} && ${form.cmd}>stdout 2>stderr`;
this.log.debug(`runHostJob: command: ${wrappedCmd}`);
const process: cp.ChildProcess = cp.exec(wrappedCmd);
const jobDetail: LocalTrialJobDetail = {
id: jobId,
status: 'RUNNING',
submitTime: Date.now(),
workingDirectory: workDir,
form: form,
sequenceId: this.generateSequenceId(),
pid: process.pid
};
this.jobMap.set(jobId, jobDetail);
this.log.debug(`runHostJob: return: ${JSON.stringify(jobDetail)} `);
return jobDetail;
}
private async getHostJob(jobId: string): Promise<TrialJobDetail> {
const jobDetail: LocalTrialJobDetail | undefined = this.jobMap.get(jobId);
if (jobDetail === undefined) {
throw new NNIError(NNIErrorNames.NOT_FOUND, `Host Job not found: ${jobId}`);
}
try {
await cpp.exec(`kill -0 ${jobDetail.pid}`);
return jobDetail;
} catch (error) {
if (error instanceof Error) {
this.log.debug(`getHostJob: error: ${error.message}`);
this.jobMap.delete(jobId);
throw new NNIError(NNIErrorNames.NOT_FOUND, `Host Job not found: ${error.message}`);
} else {
throw error;
}
}
}
private async writeParameterFile(directory: string, hyperParameters: HyperParameters): Promise<void> { private async writeParameterFile(directory: string, hyperParameters: HyperParameters): Promise<void> {
const filepath: string = path.join(directory, generateParamFileName(hyperParameters)); const filepath: string = path.join(directory, generateParamFileName(hyperParameters));
await fs.promises.writeFile(filepath, hyperParameters.value, { encoding: 'utf8' }); await fs.promises.writeFile(filepath, hyperParameters.value, { encoding: 'utf8' });
} }
private generateSequenceId(): number {
if (this.trialSequenceId === -1) {
this.trialSequenceId = getInitTrialSequenceId();
}
return this.trialSequenceId++;
}
} }
export { LocalTrainingService }; export { LocalTrainingService };
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
'use strict'; 'use strict';
import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
/** /**
* PAI trial job detail * PAI trial job detail
...@@ -34,20 +34,18 @@ export class PAITrialJobDetail implements TrialJobDetail { ...@@ -34,20 +34,18 @@ export class PAITrialJobDetail implements TrialJobDetail {
public tags?: string[]; public tags?: string[];
public url?: string; public url?: string;
public workingDirectory: string; public workingDirectory: string;
public form: JobApplicationForm; public form: TrialJobApplicationForm;
public sequenceId: number;
public hdfsLogPath: string; public hdfsLogPath: string;
public isEarlyStopped?: boolean; public isEarlyStopped?: boolean;
constructor(id: string, status: TrialJobStatus, paiJobName : string, constructor(id: string, status: TrialJobStatus, paiJobName : string,
submitTime: number, workingDirectory: string, form: JobApplicationForm, sequenceId: number, hdfsLogPath: string) { submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, hdfsLogPath: string) {
this.id = id; this.id = id;
this.status = status; this.status = status;
this.paiJobName = paiJobName; this.paiJobName = paiJobName;
this.submitTime = submitTime; this.submitTime = submitTime;
this.workingDirectory = workingDirectory; this.workingDirectory = workingDirectory;
this.form = form; this.form = form;
this.sequenceId = sequenceId;
this.tags = []; this.tags = [];
this.hdfsLogPath = hdfsLogPath; this.hdfsLogPath = hdfsLogPath;
} }
......
...@@ -30,10 +30,10 @@ import { EventEmitter } from 'events'; ...@@ -30,10 +30,10 @@ import { EventEmitter } from 'events';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import { String } from 'typescript-string-operations'; import { String } from 'typescript-string-operations';
import { MethodNotImplementedError } from '../../common/errors'; import { MethodNotImplementedError } from '../../common/errors';
import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo'; import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { import {
HyperParameters, JobApplicationForm, NNIManagerIpConfig, TrainingService, HyperParameters, NNIManagerIpConfig, TrainingService,
TrialJobApplicationForm, TrialJobDetail, TrialJobMetric TrialJobApplicationForm, TrialJobDetail, TrialJobMetric
} from '../../common/trainingService'; } from '../../common/trainingService';
import { delay, generateParamFileName, import { delay, generateParamFileName,
...@@ -70,7 +70,6 @@ class PAITrainingService implements TrainingService { ...@@ -70,7 +70,6 @@ class PAITrainingService implements TrainingService {
private readonly paiTokenUpdateInterval: number; private readonly paiTokenUpdateInterval: number;
private readonly experimentId! : string; private readonly experimentId! : string;
private readonly paiJobCollector : PAIJobInfoCollector; private readonly paiJobCollector : PAIJobInfoCollector;
private nextTrialSequenceId: number;
private paiRestServerPort?: number; private paiRestServerPort?: number;
private nniManagerIpConfig?: NNIManagerIpConfig; private nniManagerIpConfig?: NNIManagerIpConfig;
private copyExpCodeDirPromise?: Promise<void>; private copyExpCodeDirPromise?: Promise<void>;
...@@ -90,7 +89,6 @@ class PAITrainingService implements TrainingService { ...@@ -90,7 +89,6 @@ class PAITrainingService implements TrainingService {
this.expRootDir = path.join('/nni', 'experiments', getExperimentId()); this.expRootDir = path.join('/nni', 'experiments', getExperimentId());
this.experimentId = getExperimentId(); this.experimentId = getExperimentId();
this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap); this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap);
this.nextTrialSequenceId = -1;
this.paiTokenUpdateInterval = 7200000; //2hours this.paiTokenUpdateInterval = 7200000; //2hours
this.logCollection = 'none'; this.logCollection = 'none';
this.log.info('Construct OpenPAI training service.'); this.log.info('Construct OpenPAI training service.');
...@@ -112,10 +110,8 @@ class PAITrainingService implements TrainingService { ...@@ -112,10 +110,8 @@ class PAITrainingService implements TrainingService {
const jobs: TrialJobDetail[] = []; const jobs: TrialJobDetail[] = [];
for (const [key, value] of this.trialJobsMap) { for (const [key, value] of this.trialJobsMap) {
if (value.form.jobType === 'TRIAL') {
jobs.push(await this.getTrialJob(key)); jobs.push(await this.getTrialJob(key));
} }
}
return Promise.resolve(jobs); return Promise.resolve(jobs);
} }
...@@ -142,7 +138,7 @@ class PAITrainingService implements TrainingService { ...@@ -142,7 +138,7 @@ class PAITrainingService implements TrainingService {
this.metricsEmitter.off('metric', listener); this.metricsEmitter.off('metric', listener);
} }
public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> { public async submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
if (this.paiClusterConfig === undefined) { if (this.paiClusterConfig === undefined) {
throw new Error(`paiClusterConfig not initialized!`); throw new Error(`paiClusterConfig not initialized!`);
} }
...@@ -151,7 +147,6 @@ class PAITrainingService implements TrainingService { ...@@ -151,7 +147,6 @@ class PAITrainingService implements TrainingService {
this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`);
const trialJobId: string = uniqueString(5); const trialJobId: string = uniqueString(5);
const trialSequenceId: number = this.generateSequenceId();
//TODO: use HDFS working folder instead //TODO: use HDFS working folder instead
const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId); const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId);
const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`;
...@@ -171,7 +166,6 @@ class PAITrainingService implements TrainingService { ...@@ -171,7 +166,6 @@ class PAITrainingService implements TrainingService {
Date.now(), Date.now(),
trialWorkingFolder, trialWorkingFolder,
form, form,
trialSequenceId,
hdfsLogPath); hdfsLogPath);
this.trialJobsMap.set(trialJobId, trialJobDetail); this.trialJobsMap.set(trialJobId, trialJobDetail);
...@@ -181,16 +175,12 @@ class PAITrainingService implements TrainingService { ...@@ -181,16 +175,12 @@ class PAITrainingService implements TrainingService {
return deferred.promise; return deferred.promise;
} }
public async updateTrialJob(trialJobId: string, form: JobApplicationForm): Promise<TrialJobDetail> { public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise<TrialJobDetail> {
const trialJobDetail: undefined | TrialJobDetail = this.trialJobsMap.get(trialJobId); const trialJobDetail: undefined | TrialJobDetail = this.trialJobsMap.get(trialJobId);
if (trialJobDetail === undefined) { if (trialJobDetail === undefined) {
throw new Error(`updateTrialJob failed: ${trialJobId} not found`); throw new Error(`updateTrialJob failed: ${trialJobId} not found`);
} }
if (form.jobType === 'TRIAL') { await this.writeParameterFile(trialJobId, form.hyperParameters);
await this.writeParameterFile(trialJobId, (<TrialJobApplicationForm>form).hyperParameters);
} else {
throw new Error(`updateTrialJob failed: jobType ${form.jobType} not supported.`);
}
return trialJobDetail; return trialJobDetail;
} }
...@@ -397,11 +387,10 @@ class PAITrainingService implements TrainingService { ...@@ -397,11 +387,10 @@ class PAITrainingService implements TrainingService {
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' });
// Write file content ( parameter.cfg ) to local tmp folders // Write file content ( parameter.cfg ) to local tmp folders
const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>trialJobDetail.form); if (trialJobDetail.form !== undefined) {
if (trialForm !== undefined) {
await fs.promises.writeFile( await fs.promises.writeFile(
path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)), path.join(trialLocalTempFolder, generateParamFileName(trialJobDetail.form.hyperParameters)),
trialForm.hyperParameters.value, { encoding: 'utf8' } trialJobDetail.form.hyperParameters.value, { encoding: 'utf8' }
); );
} }
const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId); const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId);
...@@ -416,7 +405,7 @@ class PAITrainingService implements TrainingService { ...@@ -416,7 +405,7 @@ class PAITrainingService implements TrainingService {
`$PWD/${trialJobId}/nnioutput`, `$PWD/${trialJobId}/nnioutput`,
trialJobId, trialJobId,
this.experimentId, this.experimentId,
trialJobDetail.sequenceId, trialJobDetail.form.sequenceId,
this.isMultiPhase, this.isMultiPhase,
this.paiTrialConfig.command, this.paiTrialConfig.command,
nniManagerIp, nniManagerIp,
...@@ -507,14 +496,6 @@ class PAITrainingService implements TrainingService { ...@@ -507,14 +496,6 @@ class PAITrainingService implements TrainingService {
return deferred.promise; return deferred.promise;
} }
private generateSequenceId(): number {
if (this.nextTrialSequenceId === -1) {
this.nextTrialSequenceId = getInitTrialSequenceId();
}
return this.nextTrialSequenceId++;
}
private async statusCheckingLoop(): Promise<void> { private async statusCheckingLoop(): Promise<void> {
while (!this.stopping) { while (!this.stopping) {
try { try {
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
import * as fs from 'fs'; import * as fs from 'fs';
import { Client, ConnectConfig } from 'ssh2'; import { Client, ConnectConfig } from 'ssh2';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { GPUInfo, GPUSummary } from '../common/gpuData'; import { GPUInfo, GPUSummary } from '../common/gpuData';
/** /**
...@@ -82,20 +82,18 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail { ...@@ -82,20 +82,18 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
public tags?: string[]; public tags?: string[];
public url?: string; public url?: string;
public workingDirectory: string; public workingDirectory: string;
public form: JobApplicationForm; public form: TrialJobApplicationForm;
public sequenceId: number;
public rmMeta?: RemoteMachineMeta; public rmMeta?: RemoteMachineMeta;
public isEarlyStopped?: boolean; public isEarlyStopped?: boolean;
public gpuIndices: GPUInfo[]; public gpuIndices: GPUInfo[];
constructor(id: string, status: TrialJobStatus, submitTime: number, constructor(id: string, status: TrialJobStatus, submitTime: number,
workingDirectory: string, form: JobApplicationForm, sequenceId: number) { workingDirectory: string, form: TrialJobApplicationForm) {
this.id = id; this.id = id;
this.status = status; this.status = status;
this.submitTime = submitTime; this.submitTime = submitTime;
this.workingDirectory = workingDirectory; this.workingDirectory = workingDirectory;
this.form = form; this.form = form;
this.sequenceId = sequenceId;
this.tags = []; this.tags = [];
this.gpuIndices = []; this.gpuIndices = [];
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment