Unverified Commit 3d221da9 authored by fishyds's avatar fishyds Committed by GitHub
Browse files

Merge latest code changes into Github Master (#54)

* Merge latest code changes into Github Master

* temporary modification for travis

* temporary modification for travis
parent c015421c
# NNI runtime image: adds TensorFlow and Keras on top of the locally built NNI base image.
FROM nni.build.base:cuda9.0-cudnn7-devel-ubuntu16.04

LABEL maintainer="Microsoft NNI Team<nni@microsoft.com>"

# TensorFlow (GPU build), pinned to 1.10.0
RUN pip3 install --no-cache-dir tensorflow-gpu==1.10.0

# Keras, pinned to 2.1.6
RUN pip3 install --no-cache-dir Keras==2.1.6
\ No newline at end of file
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Base image for NNI: CUDA 9.0 / cuDNN 7 on Ubuntu 16.04 with Python 3.5, numpy, scipy and NNI v0.1.
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04

LABEL maintainer='Microsoft NNI Team<nni@microsoft.com>'

# System tools and native build dependencies.
RUN apt-get update && apt-get install -y --no-install-recommends \
        sudo apt-utils git curl vim unzip openssh-client wget \
        build-essential cmake \
        libopenblas-dev

#
# Python 3.5
#
# `apt-get update` is repeated here on purpose: every RUN is cached as its own layer,
# so an install that relies on the package lists of an earlier (possibly stale, cached)
# layer can fail with "unable to fetch" errors or pull outdated packages.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3.5 python3.5-dev python3-pip python3-tk && \
    pip3 install --no-cache-dir --upgrade pip setuptools && \
    echo "alias python='python3'" >> /root/.bash_aliases && \
    echo "alias pip='pip3'" >> /root/.bash_aliases

# numpy 1.14.3 scipy 1.1.0
RUN pip3 --no-cache-dir install \
        numpy==1.14.3 scipy==1.1.0

#
# Install node 10.9.0, yarn 1.9.4, NNI v0.1
#
# Clone, install and delete the checkout within a single RUN: each RUN starts again in
# the image's working directory (so a trailing `cd ..` in a later RUN has no effect), and
# files removed in a later layer still occupy space in the earlier one.
RUN git clone -b v0.1 https://github.com/Microsoft/nni.git && \
    (cd nni && sh install.sh) && \
    rm -rf nni

# Expose the user-local node, yarn and pip-installed binaries to interactive shells.
RUN echo 'PATH=~/.local/node/bin:~/.local/yarn/bin:~/.local/bin:$PATH' >> ~/.bashrc
Dockerfile
===
## 1.Description
This is the Dockerfile of the NNI project. The image it builds bundles commonly used deep learning frameworks together with an NNI installation, so you can run your NNI experiments directly inside the Docker container.
## 2.Included Libraries
```
Ubuntu 16.04 LTS
CUDA 9.0, CuDNN 7.0
numpy 1.14.3,scipy 1.1.0
TensorFlow 1.10.0
Keras 2.1.6
NNI v0.1
```
## 3.How to run
```
docker build -f Dockerfile.build.base -t nni.build.base:cuda9.0-cudnn7-devel-ubuntu16.04 .
docker build -t nni/nni .
nvidia-docker run -it nni/nni
```
\ No newline at end of file
...@@ -80,10 +80,17 @@ setup( ...@@ -80,10 +80,17 @@ setup(
'psutil', 'psutil',
'pyyaml', 'pyyaml',
'requests', 'requests',
'scipy' 'scipy',
'schema'
],
dependency_links = [
'git+https://github.com/hyperopt/hyperopt.git'
], ],
cmdclass={ cmdclass={
'install': CustomInstallCommand 'install': CustomInstallCommand
},
entry_points={
'console_scripts': ['nnictl = nnicmd.nnictl:parse_args']
} }
) )
...@@ -26,14 +26,14 @@ type TrialJobEvent = TrialJobStatus | 'USER_TO_CANCEL' | 'ADD_CUSTOMIZED'; ...@@ -26,14 +26,14 @@ type TrialJobEvent = TrialJobStatus | 'USER_TO_CANCEL' | 'ADD_CUSTOMIZED';
type MetricType = 'PERIODICAL' | 'FINAL' | 'CUSTOM'; type MetricType = 'PERIODICAL' | 'FINAL' | 'CUSTOM';
interface ExperimentProfileRecord { interface ExperimentProfileRecord {
readonly timestamp: Date; readonly timestamp: number;
readonly experimentId: number; readonly experimentId: number;
readonly revision: number; readonly revision: number;
readonly data: ExperimentProfile; readonly data: ExperimentProfile;
} }
interface TrialJobEventRecord { interface TrialJobEventRecord {
readonly timestamp: Date; readonly timestamp: number;
readonly trialJobId: string; readonly trialJobId: string;
readonly event: TrialJobEvent; readonly event: TrialJobEvent;
readonly data?: string; readonly data?: string;
...@@ -49,7 +49,7 @@ interface MetricData { ...@@ -49,7 +49,7 @@ interface MetricData {
} }
interface MetricDataRecord { interface MetricDataRecord {
readonly timestamp: Date; readonly timestamp: number;
readonly trialJobId: string; readonly trialJobId: string;
readonly parameterId: string; readonly parameterId: string;
readonly type: MetricType; readonly type: MetricType;
...@@ -60,8 +60,8 @@ interface MetricDataRecord { ...@@ -60,8 +60,8 @@ interface MetricDataRecord {
interface TrialJobInfo { interface TrialJobInfo {
id: string; id: string;
status: TrialJobStatus; status: TrialJobStatus;
startTime?: Date; startTime?: number;
endTime?: Date; endTime?: number;
hyperParameters?: string; hyperParameters?: string;
logPath?: string; logPath?: string;
finalMetricData?: string; finalMetricData?: string;
...@@ -96,4 +96,4 @@ abstract class Database { ...@@ -96,4 +96,4 @@ abstract class Database {
export { export {
DataStore, Database, TrialJobEvent, MetricType, MetricData, TrialJobInfo, DataStore, Database, TrialJobEvent, MetricType, MetricData, TrialJobInfo,
ExperimentProfileRecord, TrialJobEventRecord, MetricDataRecord ExperimentProfileRecord, TrialJobEventRecord, MetricDataRecord
} };
...@@ -59,8 +59,8 @@ interface ExperimentProfile { ...@@ -59,8 +59,8 @@ interface ExperimentProfile {
params: ExperimentParams; params: ExperimentParams;
id: string; id: string;
execDuration: number; execDuration: number;
startTime?: Date; startTime?: number;
endTime?: Date; endTime?: number;
revision: number; revision: number;
} }
...@@ -69,6 +69,11 @@ interface TrialJobStatistics { ...@@ -69,6 +69,11 @@ interface TrialJobStatistics {
trialJobNumber: number; trialJobNumber: number;
} }
/** Lifecycle phases an NNI manager can report. */
type NNIManagerStatusValue = 'INITIALIZED' | 'EXPERIMENT_RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED';

/**
 * Status report of an NNI manager: its current lifecycle phase plus
 * the error messages recorded so far.
 */
interface NNIManagerStatus {
    /** Current lifecycle phase. */
    status: NNIManagerStatusValue;
    /** Recorded error messages. */
    errors: string[];
}
abstract class Manager { abstract class Manager {
public abstract startExperiment(experimentParams: ExperimentParams): Promise<string>; public abstract startExperiment(experimentParams: ExperimentParams): Promise<string>;
public abstract resumeExperiment(): Promise<void>; public abstract resumeExperiment(): Promise<void>;
...@@ -86,6 +91,7 @@ abstract class Manager { ...@@ -86,6 +91,7 @@ abstract class Manager {
public abstract getMetricData(trialJobId: string, metricType: MetricType): Promise<MetricDataRecord[]>; public abstract getMetricData(trialJobId: string, metricType: MetricType): Promise<MetricDataRecord[]>;
public abstract getTrialJobStatistics(): Promise<TrialJobStatistics[]>; public abstract getTrialJobStatistics(): Promise<TrialJobStatistics[]>;
public abstract getStatus(): NNIManagerStatus;
} }
export { Manager, ExperimentParams, ExperimentProfile, TrialJobStatistics, ProfileUpdateType }; export { Manager, ExperimentParams, ExperimentProfile, TrialJobStatistics, ProfileUpdateType, NNIManagerStatus };
...@@ -19,27 +19,26 @@ ...@@ -19,27 +19,26 @@
'use strict'; 'use strict';
import * as bodyParser from 'body-parser';
import * as express from 'express'; import * as express from 'express';
import * as http from 'http'; import * as http from 'http';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import { getLogger, Logger } from './log';
import * as component from '../common/component'; /**
import { getLogger, Logger } from '../common/log'; * Abstraction class to create a RestServer
import { Manager } from '../common/manager'; * The module who wants to use a RestServer could <b>extends</b> this abstract class
import { createRestHandler } from './restHandler'; * And implement its own registerRestHandler() function to register routers
*/
@component.Singleton export abstract class RestServer {
export class RestServer {
public static readonly DEFAULT_PORT: number = 51188;
private readonly API_ROOT_URL: string = '/api/v1/nni';
private hostName: string = '0.0.0.0';
private port: number = RestServer.DEFAULT_PORT;
private startTask!: Deferred<void>; private startTask!: Deferred<void>;
private stopTask!: Deferred<void>; private stopTask!: Deferred<void>;
private app: express.Application = express();
private server!: http.Server; private server!: http.Server;
private log: Logger = getLogger();
/** The fields can be inherited by subclass */
protected hostName: string = '0.0.0.0';
protected port?: number;
protected app: express.Application = express();
protected log: Logger = getLogger();
get endPoint(): string { get endPoint(): string {
// tslint:disable-next-line:no-http-string // tslint:disable-next-line:no-http-string
...@@ -61,7 +60,7 @@ export class RestServer { ...@@ -61,7 +60,7 @@ export class RestServer {
this.port = port; this.port = port;
} }
this.server = this.app.listen(this.port, this.hostName).on('listening', () => { this.server = this.app.listen(this.port as number, this.hostName).on('listening', () => {
this.startTask.resolve(); this.startTask.resolve();
}).on('error', (e: Error) => { }).on('error', (e: Error) => {
this.startTask.reject(e); this.startTask.reject(e);
...@@ -100,8 +99,8 @@ export class RestServer { ...@@ -100,8 +99,8 @@ export class RestServer {
return this.stopTask.promise; return this.stopTask.promise;
} }
private registerRestHandler(): void { /**
this.app.use(bodyParser.json()); * Register REST handler, which is left for subclass to implement
this.app.use(this.API_ROOT_URL, createRestHandler(this)); */
} protected abstract registerRestHandler(): void;
} }
...@@ -58,9 +58,9 @@ interface HostJobApplicationForm extends JobApplicationForm { ...@@ -58,9 +58,9 @@ interface HostJobApplicationForm extends JobApplicationForm {
interface TrialJobDetail { interface TrialJobDetail {
readonly id: string; readonly id: string;
readonly status: TrialJobStatus; readonly status: TrialJobStatus;
readonly submitTime: Date; readonly submitTime: number;
readonly startTime?: Date; readonly startTime?: number;
readonly endTime?: Date; readonly endTime?: number;
readonly tags?: string[]; readonly tags?: string[];
readonly url?: string; readonly url?: string;
readonly workingDirectory: string; readonly workingDirectory: string;
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
'use strict'; 'use strict';
import * as assert from 'assert';
import { randomBytes } from 'crypto'; import { randomBytes } from 'crypto';
import * as fs from 'fs'; import * as fs from 'fs';
import * as os from 'os'; import * as os from 'os';
...@@ -32,7 +33,7 @@ import { ExperimentStartupInfo, getExperimentId, setExperimentStartupInfo } from ...@@ -32,7 +33,7 @@ import { ExperimentStartupInfo, getExperimentId, setExperimentStartupInfo } from
import { Manager } from './manager'; import { Manager } from './manager';
import { TrainingService } from './trainingService'; import { TrainingService } from './trainingService';
function getExperimentRootDir(): string{ function getExperimentRootDir(): string {
return path.join(os.homedir(), 'nni', 'experiments', getExperimentId()); return path.join(os.homedir(), 'nni', 'experiments', getExperimentId());
} }
...@@ -115,6 +116,12 @@ function uniqueString(len: number): string { ...@@ -115,6 +116,12 @@ function uniqueString(len: number): string {
return String.fromCharCode(...codes); return String.fromCharCode(...codes);
} }
/**
 * Pick one element of an array at a random index.
 *
 * The index comes from Math.random(), so the choice must not be used for
 * anything security-sensitive.
 *
 * @param a non-empty array to choose from
 * @returns one of the elements of `a`
 * @throws AssertionError if `a` is undefined or empty
 */
function randomSelect<T>(a: T[]): T {
    assert(a !== undefined);
    // Indexing an empty array yields `undefined`, which would silently violate the
    // declared return type T; fail loudly instead.
    assert(a.length > 0, 'randomSelect: cannot select from an empty array');

    // tslint:disable-next-line:insecure-random
    return a[Math.floor(Math.random() * a.length)];
}
function parseArg(names: string[]): string { function parseArg(names: string[]): string {
if (process.argv.length >= 4) { if (process.argv.length >= 4) {
for (let i: number = 2; i < process.argv.length - 1; i++) { for (let i: number = 2; i < process.argv.length - 1; i++) {
...@@ -223,4 +230,4 @@ function cleanupUnitTest(): void { ...@@ -223,4 +230,4 @@ function cleanupUnitTest(): void {
} }
export { getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getDefaultDatabaseDir, mkDirP, delay, prepareUnitTest, export { getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getDefaultDatabaseDir, mkDirP, delay, prepareUnitTest,
parseArg, cleanupUnitTest, uniqueString }; parseArg, cleanupUnitTest, uniqueString, randomSelect };
...@@ -126,7 +126,7 @@ class NNIDataStore implements DataStore { ...@@ -126,7 +126,7 @@ class NNIDataStore implements DataStore {
type: metrics.type, type: metrics.type,
sequence: metrics.sequence, sequence: metrics.sequence,
data: metrics.value, data: metrics.value,
timestamp: new Date() timestamp: Date.now()
})); }));
} }
......
...@@ -29,7 +29,7 @@ import { getExperimentId } from '../common/experimentStartupInfo'; ...@@ -29,7 +29,7 @@ import { getExperimentId } from '../common/experimentStartupInfo';
import { getLogger, Logger } from '../common/log'; import { getLogger, Logger } from '../common/log';
import { import {
ExperimentParams, ExperimentProfile, Manager, ExperimentParams, ExperimentProfile, Manager,
ProfileUpdateType, TrialJobStatistics NNIManagerStatus, ProfileUpdateType, TrialJobStatistics
} from '../common/manager'; } from '../common/manager';
import { import {
TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus
...@@ -56,41 +56,26 @@ class NNIManager implements Manager { ...@@ -56,41 +56,26 @@ class NNIManager implements Manager {
private dataStore: DataStore; private dataStore: DataStore;
private experimentProfile: ExperimentProfile; private experimentProfile: ExperimentProfile;
private dispatcherPid: number; private dispatcherPid: number;
private status: NNIManagerStatus;
constructor() { constructor() {
this.currSubmittedTrialNum = 0; this.currSubmittedTrialNum = 0;
this.trialConcurrencyReduction = 0; this.trialConcurrencyReduction = 0;
this.customizedTrials = []; this.customizedTrials = [];
const experimentId: string = getExperimentId();
this.trainingService = component.get(TrainingService); this.trainingService = component.get(TrainingService);
assert(this.trainingService); assert(this.trainingService);
this.dispatcherPid = 0; this.dispatcherPid = 0;
this.log = getLogger(); this.log = getLogger();
this.dataStore = component.get(DataStore); this.dataStore = component.get(DataStore);
this.experimentProfile = { this.experimentProfile = this.createEmptyExperimentProfile();
id: experimentId, this.status = {
revision: 0, status: 'INITIALIZED',
execDuration: 0, errors: []
params: {
authorName: '',
experimentName: '',
trialConcurrency: 0,
maxExecDuration: 0, // unit: second
maxTrialNum: 0, // maxTrialNum includes all the submitted trial jobs
searchSpace: '',
tuner: {
className: '',
classArgs: {},
checkpointDir: ''
}
}
}; };
} }
public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> { public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> {
// TO DO: remove this line, and let rest server do data type validation
experimentProfile.startTime = new Date(<string><any>experimentProfile.startTime);
switch (updateType) { switch (updateType) {
case 'TRIAL_CONCURRENCY': case 'TRIAL_CONCURRENCY':
this.updateTrialConcurrency(experimentProfile.params.trialConcurrency); this.updateTrialConcurrency(experimentProfile.params.trialConcurrency);
...@@ -140,10 +125,11 @@ class NNIManager implements Manager { ...@@ -140,10 +125,11 @@ class NNIManager implements Manager {
'start', 'start',
expParams.tuner.checkpointDir); expParams.tuner.checkpointDir);
this.experimentProfile.startTime = new Date(); this.experimentProfile.startTime = Date.now();
this.status.status = 'EXPERIMENT_RUNNING';
await this.storeExperimentProfile(); await this.storeExperimentProfile();
this.run().catch(err => { this.run().catch((err: Error) => {
this.log.error(err.stack); this.criticalError(err);
}); });
return this.experimentProfile.id; return this.experimentProfile.id;
} }
...@@ -172,6 +158,8 @@ class NNIManager implements Manager { ...@@ -172,6 +158,8 @@ class NNIManager implements Manager {
.filter((job: TrialJobInfo) => job.status === 'WAITING' || job.status === 'RUNNING') .filter((job: TrialJobInfo) => job.status === 'WAITING' || job.status === 'RUNNING')
.map((job: TrialJobInfo) => this.dataStore.storeTrialJobEvent('FAILED', job.id))); .map((job: TrialJobInfo) => this.dataStore.storeTrialJobEvent('FAILED', job.id)));
this.status.status = 'EXPERIMENT_RUNNING';
// TO DO: update database record for resume event // TO DO: update database record for resume event
this.run().catch(console.error); this.run().catch(console.error);
} }
...@@ -206,6 +194,7 @@ class NNIManager implements Manager { ...@@ -206,6 +194,7 @@ class NNIManager implements Manager {
} }
public stopExperiment(): Promise<void> { public stopExperiment(): Promise<void> {
this.status.status = 'STOPPING';
if (this.trialJobsMaintainer !== undefined) { if (this.trialJobsMaintainer !== undefined) {
this.trialJobsMaintainer.setStopLoop(); this.trialJobsMaintainer.setStopLoop();
...@@ -227,6 +216,10 @@ class NNIManager implements Manager { ...@@ -227,6 +216,10 @@ class NNIManager implements Manager {
return deferred.promise; return deferred.promise;
} }
/**
 * Report the manager's current status.
 *
 * @returns the status object held by this manager (the same object, not a copy)
 */
public getStatus(): NNIManagerStatus {
return this.status;
}
public async listTrialJobs(status?: TrialJobStatus): Promise<TrialJobInfo[]> { public async listTrialJobs(status?: TrialJobStatus): Promise<TrialJobInfo[]> {
return this.dataStore.listTrialJobs(status); return this.dataStore.listTrialJobs(status);
} }
...@@ -329,16 +322,17 @@ class NNIManager implements Manager { ...@@ -329,16 +322,17 @@ class NNIManager implements Manager {
} }
} }
await this.trainingService.cleanUp(); await this.trainingService.cleanUp();
this.experimentProfile.endTime = new Date(); this.experimentProfile.endTime = Date.now();
await this.storeExperimentProfile(); await this.storeExperimentProfile();
this.status.status = 'STOPPED';
} }
private async periodicallyUpdateExecDuration(): Promise<void> { private async periodicallyUpdateExecDuration(): Promise<void> {
const startTime: Date = new Date(); const startTime: number = Date.now();
const execDuration: number = this.experimentProfile.execDuration; const execDuration: number = this.experimentProfile.execDuration;
for (; ;) { for (; ;) {
await delay(1000 * 60 * 10); // 10 minutes await delay(1000 * 60 * 10); // 10 minutes
this.experimentProfile.execDuration = execDuration + (Date.now() - startTime.getTime()) / 1000; this.experimentProfile.execDuration = execDuration + (Date.now() - startTime) / 1000;
await this.storeExperimentProfile(); await this.storeExperimentProfile();
} }
} }
...@@ -349,25 +343,71 @@ class NNIManager implements Manager { ...@@ -349,25 +343,71 @@ class NNIManager implements Manager {
return this.dataStore.storeExperimentProfile(this.experimentProfile); return this.dataStore.storeExperimentProfile(this.experimentProfile);
} }
// tslint:disable-next-line:max-func-body-length private async run(): Promise<void> {
private runInternal(): Promise<void> { this.trialJobsMaintainer = new TrialJobs(
this.trainingService,
this.experimentProfile.execDuration,
this.experimentProfile.params.maxExecDuration);
assert(this.dispatcher !== undefined && this.trialJobsMaintainer !== undefined);
this.addEventListeners();
this.sendInitTunerCommands();
await Promise.all([
this.periodicallyUpdateExecDuration(),
this.trainingService.run(),
this.trialJobsMaintainer.run()]);
}
private addEventListeners(): void {
// TO DO: cannot run this method more than once in one NNIManager instance // TO DO: cannot run this method more than once in one NNIManager instance
if (this.dispatcher === undefined || this.trialJobsMaintainer === undefined) {
throw new Error('Error: tuner or job maintainer have not been setup');
}
this.trainingService.addTrialJobMetricListener((metric: TrialJobMetric) => {
this.onTrialJobMetrics(metric).catch((err: Error) => {
this.criticalError(err);
});
});
this.trialJobsMaintainer.on(async (event: TrialJobMaintainerEvent, trialJobDetail: TrialJobDetail) => {
this.onTrialJobEvent(event, trialJobDetail).catch((err: Error) => {
this.criticalError(err);
});
});
this.dispatcher.onCommand((commandType: string, content: string) => {
this.onTunerCommand(commandType, content).catch((err: Error) => {
this.criticalError(err);
});
});
}
private sendInitTunerCommands(): void {
if (this.dispatcher === undefined) { if (this.dispatcher === undefined) {
throw new Error('Error: tuner has not been setup'); throw new Error('Error: tuner has not been setup');
} }
this.trainingService.addTrialJobMetricListener(async (metric: TrialJobMetric) => { // TO DO: we should send INITIALIZE command to tuner if user's tuner needs to run init method in tuner
this.log.debug(`Send tuner command: update search space: ${this.experimentProfile.params.searchSpace}`);
this.dispatcher.sendCommand(UPDATE_SEARCH_SPACE, this.experimentProfile.params.searchSpace);
if (this.trialConcurrencyReduction !== 0) {
throw new Error('Error: cannot modify trialConcurrency before startExperiment');
}
this.log.debug(`Send tuner command: ${this.experimentProfile.params.trialConcurrency}`);
this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, String(this.experimentProfile.params.trialConcurrency));
}
private async onTrialJobMetrics(metric: TrialJobMetric): Promise<void> {
await this.dataStore.storeMetricData(metric.id, metric.data); await this.dataStore.storeMetricData(metric.id, metric.data);
if (this.dispatcher === undefined) { if (this.dispatcher === undefined) {
throw new Error('Error: tuner has not been setup'); throw new Error('Error: tuner has not been setup');
} }
this.dispatcher.sendCommand(REPORT_METRIC_DATA, metric.data); this.dispatcher.sendCommand(REPORT_METRIC_DATA, metric.data);
}); }
this.trialJobsMaintainer = new TrialJobs( private async onTrialJobEvent(event: TrialJobMaintainerEvent, trialJobDetail: TrialJobDetail): Promise<void> {
this.trainingService,
this.experimentProfile.execDuration,
this.experimentProfile.params.maxExecDuration);
this.trialJobsMaintainer.on(async (event: TrialJobMaintainerEvent, trialJobDetail: TrialJobDetail) => {
if (trialJobDetail !== undefined) { if (trialJobDetail !== undefined) {
this.log.debug(`Job event: ${event}, id: ${trialJobDetail.id}`); this.log.debug(`Job event: ${event}, id: ${trialJobDetail.id}`);
} else { } else {
...@@ -407,17 +447,9 @@ class NNIManager implements Manager { ...@@ -407,17 +447,9 @@ class NNIManager implements Manager {
default: default:
throw new Error('Error: unrecognized event from trialJobsMaintainer'); throw new Error('Error: unrecognized event from trialJobsMaintainer');
} }
});
// TO DO: we should send INITIALIZE command to tuner if user's tuner needs to run init method in tuner
this.log.debug(`Send tuner command: update search space: ${this.experimentProfile.params.searchSpace}`);
this.dispatcher.sendCommand(UPDATE_SEARCH_SPACE, this.experimentProfile.params.searchSpace);
if (this.trialConcurrencyReduction !== 0) {
return Promise.reject(new Error('Error: cannot modify trialConcurrency before startExperiment'));
} }
this.log.debug(`Send tuner command: ${this.experimentProfile.params.trialConcurrency}`)
this.dispatcher.sendCommand(REQUEST_TRIAL_JOBS, String(this.experimentProfile.params.trialConcurrency)); private async onTunerCommand(commandType: string, content: string): Promise<void> {
this.dispatcher.onCommand(async (commandType: string, content: string) => {
this.log.info(`Command from tuner: ${commandType}, ${content}`); this.log.info(`Command from tuner: ${commandType}, ${content}`);
if (this.trialJobsMaintainer === undefined) { if (this.trialJobsMaintainer === undefined) {
throw new Error('Error: trialJobsMaintainer not initialized'); throw new Error('Error: trialJobsMaintainer not initialized');
...@@ -432,6 +464,7 @@ class NNIManager implements Manager { ...@@ -432,6 +464,7 @@ class NNIManager implements Manager {
}; };
const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(trialJobAppForm); const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(trialJobAppForm);
this.trialJobsMaintainer.setTrialJob(trialJobDetail.id, Object.assign({}, trialJobDetail)); this.trialJobsMaintainer.setTrialJob(trialJobDetail.id, Object.assign({}, trialJobDetail));
// TO DO: to uncomment
assert(trialJobDetail.status === 'WAITING'); assert(trialJobDetail.status === 'WAITING');
await this.dataStore.storeTrialJobEvent(trialJobDetail.status, trialJobDetail.id, content, trialJobDetail.url); await this.dataStore.storeTrialJobEvent(trialJobDetail.status, trialJobDetail.id, content, trialJobDetail.url);
if (this.currSubmittedTrialNum === this.experimentProfile.params.maxTrialNum) { if (this.currSubmittedTrialNum === this.experimentProfile.params.maxTrialNum) {
...@@ -446,18 +479,42 @@ class NNIManager implements Manager { ...@@ -446,18 +479,42 @@ class NNIManager implements Manager {
await this.trainingService.cancelTrialJob(JSON.parse(content)); await this.trainingService.cancelTrialJob(JSON.parse(content));
break; break;
default: default:
throw new Error(`Error: unsupported command type: [${commandType}]`); throw new Error('Error: unsupported command type from tuner');
}
} }
});
return this.trialJobsMaintainer.run(); private criticalError(err: Error): void {
this.logError(err);
console.error(err);
} }
private async run(): Promise<void> { private logError(err: Error): void {
await Promise.all([ if (err.stack !== undefined) {
this.periodicallyUpdateExecDuration(), this.log.error(err.stack);
this.trainingService.run(), }
this.runInternal()]); this.status.errors.push(err.message);
this.status.status = 'ERROR';
}
/**
 * Build a blank experiment profile: the current experiment id, revision 0,
 * zero execution duration, and experiment params filled with empty strings,
 * zeros and an empty tuner description.
 */
private createEmptyExperimentProfile(): ExperimentProfile {
    const emptyProfile: ExperimentProfile = {
        id: getExperimentId(),
        revision: 0,
        execDuration: 0,
        params: {
            authorName: '',
            experimentName: '',
            trialConcurrency: 0,
            // unit: second
            maxExecDuration: 0,
            // maxTrialNum includes all the submitted trial jobs
            maxTrialNum: 0,
            searchSpace: '',
            tuner: { className: '', classArgs: {}, checkpointDir: '' }
        }
    };

    return emptyProfile;
}
} }
......
...@@ -60,15 +60,15 @@ function loadExperimentProfile(row: any): ExperimentProfile { ...@@ -60,15 +60,15 @@ function loadExperimentProfile(row: any): ExperimentProfile {
params: JSON.parse(row.params), params: JSON.parse(row.params),
id: row.id, id: row.id,
execDuration: row.execDuration, execDuration: row.execDuration,
startTime: row.startTime === null ? undefined : new Date(row.startTime), startTime: row.startTime === null ? undefined : row.startTime,
endTime: row.endTime === null ? undefined : new Date(row.endTime), endTime: row.endTime === null ? undefined : row.endTime,
revision: row.revision revision: row.revision
}; };
} }
function loadTrialJobEvent(row: any): TrialJobEventRecord { function loadTrialJobEvent(row: any): TrialJobEventRecord {
return { return {
timestamp: new Date(row.timestamp), timestamp: row.timestamp,
trialJobId: row.trialJobId, trialJobId: row.trialJobId,
event: row.event, event: row.event,
data: row.data === null ? undefined : row.data, data: row.data === null ? undefined : row.data,
...@@ -78,7 +78,7 @@ function loadTrialJobEvent(row: any): TrialJobEventRecord { ...@@ -78,7 +78,7 @@ function loadTrialJobEvent(row: any): TrialJobEventRecord {
function loadMetricData(row: any): MetricDataRecord { function loadMetricData(row: any): MetricDataRecord {
return { return {
timestamp: new Date(row.timestamp), timestamp: row.timestamp,
trialJobId: row.trialJobId, trialJobId: row.trialJobId,
parameterId: row.parameterId, parameterId: row.parameterId,
type: row.type, type: row.type,
...@@ -132,8 +132,8 @@ class SqlDB implements Database { ...@@ -132,8 +132,8 @@ class SqlDB implements Database {
JSON.stringify(exp.params), JSON.stringify(exp.params),
exp.id, exp.id,
exp.execDuration, exp.execDuration,
exp.startTime === undefined ? null : exp.startTime.getTime(), exp.startTime === undefined ? null : exp.startTime,
exp.endTime === undefined ? null : exp.endTime.getTime(), exp.endTime === undefined ? null : exp.endTime,
exp.revision exp.revision
]; ];
......
...@@ -76,8 +76,8 @@ describe('Unit test for dataStore', () => { ...@@ -76,8 +76,8 @@ describe('Unit test for dataStore', () => {
}, },
id: 'exp123', id: 'exp123',
execDuration: 0, execDuration: 0,
startTime: new Date(), startTime: Date.now(),
endTime: new Date(), endTime: Date.now(),
revision: 0 revision: 0
} }
const id: string = profile.id; const id: string = profile.id;
...@@ -128,14 +128,14 @@ describe('Unit test for dataStore', () => { ...@@ -128,14 +128,14 @@ describe('Unit test for dataStore', () => {
parameter_id: 'abc', parameter_id: 'abc',
type: 'PERIODICAL', type: 'PERIODICAL',
value: 'acc: 0.88', value: 'acc: 0.88',
timestamp: new Date() timestamp: Date.now()
}, },
{ {
trial_job_id: '111', trial_job_id: '111',
parameter_id: 'abc', parameter_id: 'abc',
type: 'FINAL', type: 'FINAL',
value: 'acc: 0.88', value: 'acc: 0.88',
timestamp: new Date() timestamp: Date.now()
} }
]; ];
......
...@@ -118,7 +118,7 @@ class MockedDataStore implements DataStore { ...@@ -118,7 +118,7 @@ class MockedDataStore implements DataStore {
async storeTrialJobEvent(event: TrialJobEvent, trialJobId: string, data?: string | undefined): Promise<void> { async storeTrialJobEvent(event: TrialJobEvent, trialJobId: string, data?: string | undefined): Promise<void> {
const dataRecord: TrialJobEventRecord = { const dataRecord: TrialJobEventRecord = {
event: event, event: event,
timestamp: new Date(), timestamp: Date.now(),
trialJobId: trialJobId, trialJobId: trialJobId,
data: data data: data
} }
...@@ -175,7 +175,7 @@ class MockedDataStore implements DataStore { ...@@ -175,7 +175,7 @@ class MockedDataStore implements DataStore {
parameterId: metrics.parameter_id, parameterId: metrics.parameter_id,
type: metrics.type, type: metrics.type,
data: metrics.value, data: metrics.value,
timestamp: new Date() timestamp: Date.now()
}); });
} }
...@@ -234,13 +234,13 @@ class MockedDataStore implements DataStore { ...@@ -234,13 +234,13 @@ class MockedDataStore implements DataStore {
} }
switch (record.event) { switch (record.event) {
case 'RUNNING': case 'RUNNING':
jobInfo.startTime = new Date(); jobInfo.startTime = Date.now();
break; break;
case 'SUCCEEDED': case 'SUCCEEDED':
case 'FAILED': case 'FAILED':
case 'USER_CANCELED': case 'USER_CANCELED':
case 'SYS_CANCELED': case 'SYS_CANCELED':
jobInfo.endTime = new Date(); jobInfo.endTime = Date.now();
} }
jobInfo.status = this.getJobStatusByLatestEvent(record.event); jobInfo.status = this.getJobStatusByLatestEvent(record.event);
map.set(record.trialJobId, jobInfo); map.set(record.trialJobId, jobInfo);
......
...@@ -34,9 +34,9 @@ class MockedTrainingService extends TrainingService { ...@@ -34,9 +34,9 @@ class MockedTrainingService extends TrainingService {
public jobDetail1: TrialJobDetail = { public jobDetail1: TrialJobDetail = {
id: '1234', id: '1234',
status: 'SUCCEEDED', status: 'SUCCEEDED',
submitTime: new Date(), submitTime: Date.now(),
startTime: new Date(), startTime: Date.now(),
endTime: new Date(), endTime: Date.now(),
tags: ['test'], tags: ['test'],
url: 'http://test', url: 'http://test',
workingDirectory: '/tmp/mocked', workingDirectory: '/tmp/mocked',
...@@ -47,9 +47,9 @@ class MockedTrainingService extends TrainingService { ...@@ -47,9 +47,9 @@ class MockedTrainingService extends TrainingService {
public jobDetail2: TrialJobDetail = { public jobDetail2: TrialJobDetail = {
id: '3456', id: '3456',
status: 'SUCCEEDED', status: 'SUCCEEDED',
submitTime: new Date(), submitTime: Date.now(),
startTime: new Date(), startTime: Date.now(),
endTime: new Date(), endTime: Date.now(),
tags: ['test'], tags: ['test'],
url: 'http://test', url: 'http://test',
workingDirectory: '/tmp/mocked', workingDirectory: '/tmp/mocked',
......
...@@ -62,33 +62,33 @@ const expParams2: ExperimentParams = { ...@@ -62,33 +62,33 @@ const expParams2: ExperimentParams = {
}; };
const profiles: ExperimentProfile[] = [ const profiles: ExperimentProfile[] = [
{ params: expParams1, id: '#1', execDuration: 0, startTime: new Date(), endTime: undefined, revision: 1 }, { params: expParams1, id: '#1', execDuration: 0, startTime: Date.now(), endTime: undefined, revision: 1 },
{ params: expParams1, id: '#1', execDuration: 0, startTime: new Date(), endTime: new Date(), revision: 2 }, { params: expParams1, id: '#1', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 2 },
{ params: expParams2, id: '#2', execDuration: 0, startTime: new Date(), endTime: new Date(), revision: 2 }, { params: expParams2, id: '#2', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 2 },
{ params: expParams2, id: '#2', execDuration: 0, startTime: new Date(), endTime: new Date(), revision: 3 } { params: expParams2, id: '#2', execDuration: 0, startTime: Date.now(), endTime: Date.now(), revision: 3 }
]; ];
const events: TrialJobEventRecord[] = [ const events: TrialJobEventRecord[] = [
{ timestamp: new Date(), event: 'WAITING', trialJobId: 'A', data: 'hello' }, // 0 { timestamp: Date.now(), event: 'WAITING', trialJobId: 'A', data: 'hello' }, // 0
{ timestamp: new Date(), event: 'UNKNOWN', trialJobId: 'B', data: 'world' }, // 1 { timestamp: Date.now(), event: 'UNKNOWN', trialJobId: 'B', data: 'world' }, // 1
{ timestamp: new Date(), event: 'RUNNING', trialJobId: 'B', data: undefined }, // 2 { timestamp: Date.now(), event: 'RUNNING', trialJobId: 'B', data: undefined }, // 2
{ timestamp: new Date(), event: 'RUNNING', trialJobId: 'A', data: '123' }, // 3 { timestamp: Date.now(), event: 'RUNNING', trialJobId: 'A', data: '123' }, // 3
{ timestamp: new Date(), event: 'FAILED', trialJobId: 'A', data: undefined } // 4 { timestamp: Date.now(), event: 'FAILED', trialJobId: 'A', data: undefined } // 4
]; ];
const metrics: MetricDataRecord[] = [ const metrics: MetricDataRecord[] = [
{ timestamp: new Date(), trialJobId: 'A', parameterId: '1', type: 'PERIODICAL', sequence: 0, data: 1.1 }, // 0 { timestamp: Date.now(), trialJobId: 'A', parameterId: '1', type: 'PERIODICAL', sequence: 0, data: 1.1 }, // 0
{ timestamp: new Date(), trialJobId: 'B', parameterId: '2', type: 'PERIODICAL', sequence: 0, data: 2.1 }, // 1 { timestamp: Date.now(), trialJobId: 'B', parameterId: '2', type: 'PERIODICAL', sequence: 0, data: 2.1 }, // 1
{ timestamp: new Date(), trialJobId: 'A', parameterId: '1', type: 'PERIODICAL', sequence: 1, data: 1.2 }, // 2 { timestamp: Date.now(), trialJobId: 'A', parameterId: '1', type: 'PERIODICAL', sequence: 1, data: 1.2 }, // 2
{ timestamp: new Date(), trialJobId: 'A', parameterId: '1', type: 'FINAL', sequence: 0, data: 1.3 }, // 3 { timestamp: Date.now(), trialJobId: 'A', parameterId: '1', type: 'FINAL', sequence: 0, data: 1.3 }, // 3
{ timestamp: new Date(), trialJobId: 'C', parameterId: '2', type: 'PERIODICAL', sequence: 1, data: 2.1 }, // 4 { timestamp: Date.now(), trialJobId: 'C', parameterId: '2', type: 'PERIODICAL', sequence: 1, data: 2.1 }, // 4
{ timestamp: new Date(), trialJobId: 'C', parameterId: '2', type: 'FINAL', sequence: 0, data: 2.2 } // 5 { timestamp: Date.now(), trialJobId: 'C', parameterId: '2', type: 'FINAL', sequence: 0, data: 2.2 } // 5
]; ];
// tslint:disable-next-line:no-any // tslint:disable-next-line:no-any
function assertRecordEqual(record: any, value: any): void { function assertRecordEqual(record: any, value: any): void {
assert.ok(record.timestamp > new Date(2018, 6, 1)); assert.ok(record.timestamp > new Date(2018, 6, 1).getTime());
assert.ok(record.timestamp < new Date()); assert.ok(record.timestamp < Date.now());
for (const key in value) { // tslint:disable-line:no-for-in for (const key in value) { // tslint:disable-line:no-for-in
if (key !== 'timestamp') { if (key !== 'timestamp') {
......
...@@ -26,6 +26,9 @@ import { delay } from '../common/utils'; ...@@ -26,6 +26,9 @@ import { delay } from '../common/utils';
type TrialJobMaintainerEvent = TrialJobStatus | 'EXPERIMENT_DONE'; type TrialJobMaintainerEvent = TrialJobStatus | 'EXPERIMENT_DONE';
/**
* TrialJobs
*/
class TrialJobs { class TrialJobs {
private eventEmitter: EventEmitter; private eventEmitter: EventEmitter;
private trialJobs: Map<string, TrialJobDetail>; private trialJobs: Map<string, TrialJobDetail>;
...@@ -93,9 +96,9 @@ class TrialJobs { ...@@ -93,9 +96,9 @@ class TrialJobs {
// Do nothing // Do nothing
break; break;
case 'RUNNING': case 'RUNNING':
const oldTrialJobDetail = this.trialJobs.get(trialJobId); const oldTrialJobDetail: TrialJobDetail | undefined = this.trialJobs.get(trialJobId);
assert(oldTrialJobDetail); assert(oldTrialJobDetail);
if (oldTrialJobDetail && oldTrialJobDetail.status === "WAITING") { if (oldTrialJobDetail !== undefined && oldTrialJobDetail.status === "WAITING") {
this.trialJobs.set(trialJobId, trialJobDetail); this.trialJobs.set(trialJobId, trialJobDetail);
this.eventEmitter.emit('all', trialJobDetail.status, trialJobDetail); this.eventEmitter.emit('all', trialJobDetail.status, trialJobDetail);
} }
...@@ -112,8 +115,8 @@ class TrialJobs { ...@@ -112,8 +115,8 @@ class TrialJobs {
} }
public async run(): Promise<void> { public async run(): Promise<void> {
const startTime: Date = new Date(); const startTime: number = Date.now();
while ((Date.now() - startTime.getTime()) / 1000 + this.pastExecDuration < this.maxExecDuration) { while ((Date.now() - startTime) / 1000 + this.pastExecDuration < this.maxExecDuration) {
if (this.stopLoop || if (this.stopLoop ||
(this.noMoreTrials && this.trialJobs.size === 0)) { (this.noMoreTrials && this.trialJobs.size === 0)) {
break; break;
......
...@@ -31,7 +31,7 @@ import { parseArg, uniqueString, mkDirP, getLogDir } from './common/utils'; ...@@ -31,7 +31,7 @@ import { parseArg, uniqueString, mkDirP, getLogDir } from './common/utils';
import { NNIDataStore } from './core/nniDataStore'; import { NNIDataStore } from './core/nniDataStore';
import { NNIManager } from './core/nnimanager'; import { NNIManager } from './core/nnimanager';
import { SqlDB } from './core/sqlDatabase'; import { SqlDB } from './core/sqlDatabase';
import { RestServer } from './rest_server/server'; import { NNIRestServer } from './rest_server/nniRestServer';
import { LocalTrainingServiceForGPU } from './training_service/local/localTrainingServiceForGPU'; import { LocalTrainingServiceForGPU } from './training_service/local/localTrainingServiceForGPU';
import { import {
RemoteMachineTrainingService RemoteMachineTrainingService
...@@ -64,7 +64,7 @@ function usage(): void { ...@@ -64,7 +64,7 @@ function usage(): void {
console.info('usage: node main.js --port <port> --mode <local/remote> --start_mode <new/resume> --experiment_id <id>'); console.info('usage: node main.js --port <port> --mode <local/remote> --start_mode <new/resume> --experiment_id <id>');
} }
let port: number = RestServer.DEFAULT_PORT; let port: number = NNIRestServer.DEFAULT_PORT;
const strPort: string = parseArg(['--port', '-p']); const strPort: string = parseArg(['--port', '-p']);
if (strPort && strPort.length > 0) { if (strPort && strPort.length > 0) {
port = parseInt(strPort, 10); port = parseInt(strPort, 10);
...@@ -94,7 +94,7 @@ mkDirP(getLogDir()).then(async () => { ...@@ -94,7 +94,7 @@ mkDirP(getLogDir()).then(async () => {
const log: Logger = getLogger(); const log: Logger = getLogger();
try { try {
await initContainer(mode); await initContainer(mode);
const restServer: RestServer = component.get(RestServer); const restServer: NNIRestServer = component.get(NNIRestServer);
await restServer.start(port); await restServer.start(port);
log.info(`Rest server listening on: ${restServer.endPoint}`); log.info(`Rest server listening on: ${restServer.endPoint}`);
} catch (err) { } catch (err) {
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
"chai-as-promised": "^7.1.1", "chai-as-promised": "^7.1.1",
"child-process-promise": "^2.2.1", "child-process-promise": "^2.2.1",
"express": "^4.16.3", "express": "^4.16.3",
"express-joi-validator": "^2.0.0",
"node-nvidia-smi": "^1.0.0", "node-nvidia-smi": "^1.0.0",
"rx": "^4.1.0", "rx": "^4.1.0",
"sqlite3": "^4.0.2", "sqlite3": "^4.0.2",
......
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as bodyParser from 'body-parser';
import * as component from '../common/component';
import { RestServer } from '../common/restServer'
import { createRestHandler } from './restHandler';
/**
 * Main REST server of NNI.
 *
 * Exposes the REST API used by:
 *  - the nnictl CLI tool
 *  - the NNI Web UI
 */
@component.Singleton
export class NNIRestServer extends RestServer {
    /** Default port of the NNI main REST service. */
    public static readonly DEFAULT_PORT: number = 51188;

    /** URL prefix under which the REST handler is mounted. */
    private readonly API_ROOT_URL: string = '/api/v1/nni';

    /**
     * Creates the server and assigns DEFAULT_PORT as this server's port.
     */
    constructor() {
        super();
        this.port = NNIRestServer.DEFAULT_PORT;
    }

    /**
     * Registers this server's own middleware on the app: the JSON body
     * parser first, then the handler returned by createRestHandler,
     * mounted at API_ROOT_URL.
     */
    protected registerRestHandler(): void {
        // The JSON parser is registered ahead of the API handler.
        const jsonBodyParser = bodyParser.json();
        this.app.use(jsonBodyParser);

        const apiHandler = createRestHandler(this);
        this.app.use(this.API_ROOT_URL, apiHandler);
    }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment