Commit 252f36f8 authored by Deshui Yu
Browse files

NNI dogfood version 1

parent 781cea26
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Binary channels used by send()/receive(): file descriptor 3 is read from, file descriptor 4 is written to.
# NOTE(review): presumably both descriptors are pipes supplied by the process that spawns this script — confirm.
_in_file = open(3, 'rb')
_out_file = open(4, 'wb')
def send(command, data):
    """Write one framed message to the output channel and flush it.

    The frame consists of the UTF-8 encoded ``command``, the byte length of
    the UTF-8 encoded ``data`` as a zero-padded 6-digit decimal number, and
    finally the encoded ``data`` itself.
    """
    command_bytes = command.encode('utf8')
    payload = data.encode('utf8')
    # The length field counts encoded bytes, not characters.
    length_field = b'%06d' % len(payload)
    _out_file.write(b''.join((command_bytes, length_field, payload)))
    _out_file.flush()
def receive():
    """Read one framed message from the input channel.

    The first 8 bytes are the header: 2 bytes of command followed by 6 bytes
    holding the decimal payload length. That many bytes are then read and
    decoded as UTF-8.

    Returns:
        A ``(command, data)`` tuple of decoded strings.
    """
    header = _in_file.read(8)
    command_bytes, length_field = header[:2], header[2:]
    payload_size = int(length_field)
    command = command_bytes.decode('utf8')
    payload = _in_file.read(payload_size)
    return command, payload.decode('utf8')
# Scripted exchange: echo each of two incoming messages to stdout, replying after each one,
# then send one extra message whose payload is non-ASCII text.
print(receive())
# Reply with an empty payload.
send('KI', '')
print(receive())
# Reply with an ASCII payload.
send('KI', 'hello')
# Multi-byte UTF-8 payload: the length field written by send() is the encoded byte count.
send('KI', '世界')
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { expect } from 'chai';
import { Container, Scope } from 'typescript-ioc';
import * as component from '../../common/component';
import { Database, DataStore, TrialJobInfo } from '../../common/datastore';
import { setExperimentStartupInfo } from '../../common/experimentStartupInfo';
import { ExperimentProfile, TrialJobStatistics } from '../../common/manager';
import { TrialJobStatus } from '../../common/trainingService';
import { cleanupUnitTest, prepareUnitTest } from '../../common/utils';
import { NNIDataStore } from '../nniDataStore';
import { SqlDB } from '../sqlDatabase';
// Test suite for the DataStore resolved through the IoC container
// (DataStore is bound to NNIDataStore and Database to SqlDB in the before() hook).
describe('Unit test for dataStore', () => {
// Data store instance shared by every test case in this suite.
let ds: DataStore;
// Runs once before all tests: prepare the unit-test environment, register the singleton bindings,
// then fetch the DataStore component and wait for its init() to complete.
before(async () => {
prepareUnitTest();
Container.bind(Database).to(SqlDB).scope(Scope.Singleton);
Container.bind(DataStore).to(NNIDataStore).scope(Scope.Singleton);
ds = component.get(DataStore);
await ds.init();
});
// Runs once after all tests.
after(() => {
cleanupUnitTest();
});
// Looking up an experiment id that was never stored is expected to yield undefined.
it('test emtpy experiment profile', async () => {
const result: ExperimentProfile = await ds.getExperimentProfile('abc');
expect(result).to.equal(undefined, 'Should not get any profile');
});
// Stores the same profile five times with increasing revision numbers and reads it back.
it('test experiment profiles CRUD', async () => {
const profile: ExperimentProfile = {
params: {
authorName: 'test1',
experimentName: 'exp1',
trialConcurrency: 2,
maxExecDuration: 10,
maxTrialNum: 5,
// Search space passed as a JSON string.
searchSpace: `{
"dropout_rate": {
"_type": "uniform",
"_value": [0.1, 0.5]
},
"batch_size": {
"_type": "choice",
"_value": [50, 250, 500]
}
}`,
tuner: {
tunerCommand: 'python3 tunner.py',
tunerCwd: '/tmp',
tunerCheckpointDirectory: '/tmp/cp',
tunerGpuNum: 0
}
},
id: 'exp123',
execDuration: 0,
startTime: new Date(),
endTime: new Date(),
revision: 0
}
const id: string = profile.id;
// Revisions 0 through 4 are stored; the increment happens after each store.
for (let i: number = 0; i < 5; i++) {
await ds.storeExperimentProfile(profile);
profile.revision += 1;
}
const result: ExperimentProfile = await ds.getExperimentProfile(id);
// The last revision that was stored is 4 (the in-memory profile is at 5 by now).
expect(result.revision).to.equal(4);
});
// Event fixture: two jobs ('111' and '222') go WAITING -> RUNNING; '111' then succeeds and '222' fails.
const testEventRecords: {
event: string;
jobId: string;
data?: string;
}[] = [
{
event: 'WAITING',
jobId: '111'
},
{
event: 'WAITING',
jobId: '222'
},
{
event: 'RUNNING',
jobId: '111'
},
{
event: 'RUNNING',
jobId: '222'
},
{
event: 'SUCCEEDED',
jobId: '111',
data: 'lr: 0.001'
},
{
event: 'FAILED',
jobId: '222'
}
];
// Metric fixture: one PERIODICAL and one FINAL metric, both for job '111'.
// tslint:disable-next-line:no-any
const metricsData: any = [
{
trial_job_id: '111',
parameter_id: 'abc',
type: 'PERIODICAL',
value: 'acc: 0.88',
timestamp: new Date()
},
{
trial_job_id: '111',
parameter_id: 'abc',
type: 'FINAL',
value: 'acc: 0.88',
timestamp: new Date()
}
];
// Stores every fixture event and metric sequentially, then checks the job list and statistics sizes.
it('test trial job events store /query', async () => {
for (const event of testEventRecords) {
await ds.storeTrialJobEvent(<TrialJobStatus>event.event, event.jobId, event.data);
}
// Each metric is handed to the data store as a JSON string.
for (const metrics of metricsData) {
await ds.storeMetricData(metrics.trial_job_id, JSON.stringify(metrics));
}
const jobs: TrialJobInfo[] = await ds.listTrialJobs();
expect(jobs.length).to.equals(2, 'There should be 2 jobs');
const statistics: TrialJobStatistics[] = await ds.getTrialJobStatistics();
expect(statistics.length).to.equals(2, 'There should be 2 statistics');
});
});
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from nni.assessor import Assessor, AssessResult
class DummyAssessor(Assessor):
    """Assessor stub that gives every trial the same verdict."""

    def assess_trial(self, trial_job_id, trial_history):
        """Return ``AssessResult.Good`` without inspecting either argument."""
        verdict = AssessResult.Good
        return verdict
# Script entry point: create the assessor and call run(), which DummyAssessor inherits from Assessor.
DummyAssessor().run()
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from nni.tuner import Tuner
class TestTuner(Tuner):
    """Tuner stub that always proposes the same hyper-parameters."""

    def generate_parameters(self, trial_id):
        """Return a fixed parameter set; ``trial_id`` is not used."""
        fixed_parameters = {'lr': 0.01}
        return fixed_parameters

    def receive_trial_result(self, parameter_id, parameters, reward):
        """Ignore the reported trial result."""
        return None

    def update_search_space(self, search_space):
        """Accept any search space and report success by returning ``True``."""
        return True
# Script entry point: create the tuner and call run(), which TestTuner inherits from Tuner.
TestTuner().run()
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as assert from 'assert';
import { ChildProcess, spawn } from 'child_process';
import { Deferred } from 'ts-deferred';
import { cleanupUnitTest, prepareUnitTest } from '../../common/utils';
import * as CommandType from '../commands';
import { createAssessorInterface, createTunerInterface, IpcInterface } from '../ipcInterface';
// Shared state filled in by runProcess() and inspected by the test cases below.
// Lines read from the child's stdout once it has exited with code 0.
let sentCommands: {[key: string]: string}[] = [];
// Every command delivered to the onCommand() callback, in arrival order.
const receivedCommands: {[key: string]: string}[] = [];
// Error caught when sending the oversized command (stays undefined if nothing was thrown).
let commandTooLong: Error | undefined;
// Error caught when sending the 'GE' command (stays undefined if nothing was thrown).
let rejectCommandType: Error | undefined;
/**
 * Spawn the fake assessor script, push a fixed sequence of commands through the
 * IPC interface, and report how the child process ended.
 *
 * Side effects: fills `sentCommands`, `receivedCommands`, `commandTooLong` and
 * `rejectCommandType`.
 *
 * @returns a promise that always resolves (never rejects): `null` when the child
 *          exits with code 0, otherwise an `Error` describing the failure.
 */
function runProcess(): Promise<Error | null> {
    // Failures are reported as resolved values, so callers can simply await the result.
    const outcome: Deferred<Error | null> = new Deferred<Error | null>();

    // stdin ignored, stdout piped, stderr shared with this process, fds 3 and 4 piped
    const stdio: {}[] = ['ignore', 'pipe', process.stderr, 'pipe', 'pipe'];
    const child: ChildProcess = spawn('python3 assessor.py', [], { stdio, cwd: 'core/test', shell: true });

    child.on('error', (error: Error): void => { outcome.resolve(error); });
    child.on('exit', (code: number): void => {
        if (code === 0) {
            // whatever the child printed on stdout, one entry per line
            sentCommands = child.stdout.read().toString().split('\n');
            outcome.resolve(null);

            return;
        }
        outcome.resolve(new Error(`return code: ${code}`));
    });

    // attach the IPC interface and record everything the child sends back
    const ipc: IpcInterface = createAssessorInterface(child);
    ipc.onCommand((commandType: string, content: string): void => {
        receivedCommands.push({ commandType, content });
    });

    // Command #1: no content
    ipc.sendCommand('IN');

    // Command #2: short content
    ipc.sendCommand('ME', '123');

    // Command #3: oversized content, keep the thrown error for the tests
    try {
        ipc.sendCommand('ME', 'x'.repeat(1_000_000));
    } catch (error) {
        commandTooLong = error;
    }

    // Command #4: a command type the assessor interface should refuse
    try {
        ipc.sendCommand('GE', '1');
    } catch (error) {
        rejectCommandType = error;
    }

    return outcome.promise;
}
// Checks the commands exchanged with the fake assessor process during runProcess().
describe('core/protocol', (): void => {
    // The whole exchange runs once up front; the test cases only inspect what was recorded.
    before(async () => {
        prepareUnitTest();
        await runProcess();
    });

    after(() => {
        cleanupUnitTest();
    });

    it('should have sent 2 successful commands', (): void => {
        // two printed lines plus the empty string after the trailing newline
        assert.equal(sentCommands.length, 3);
        assert.equal(sentCommands[2], '');
    });

    it('sendCommand() should work without content', (): void => {
        const firstLine: {} = sentCommands[0];
        assert.equal(firstLine, '(\'IN\', \'\')');
    });

    it('sendCommand() should work with content', (): void => {
        const secondLine: {} = sentCommands[1];
        assert.equal(secondLine, '(\'ME\', \'123\')');
    });

    it('sendCommand() should throw on too long command', (): void => {
        const tooLong: Error = <Error>commandTooLong;
        assert.equal(tooLong.name, 'RangeError');
        assert.equal(tooLong.message, 'Command too long');
    });

    it('sendCommand() should throw on wrong command type', (): void => {
        const rejected: Error = <Error>rejectCommandType;
        assert.equal(rejected.name, 'AssertionError [ERR_ASSERTION]');
    });

    it('should have received 3 commands', (): void => {
        assert.equal(receivedCommands.length, 3);
    });

    // One test case per received command, registered in arrival order.
    const expectedReceived: { title: string; content: string }[] = [
        { title: 'onCommand() should work without content', content: '' },
        { title: 'onCommand() should work with content', content: 'hello' },
        { title: 'onCommand() should work with Unicode content', content: '世界' }
    ];
    expectedReceived.forEach((expected: { title: string; content: string }, index: number): void => {
        it(expected.title, (): void => {
            assert.deepStrictEqual(receivedCommands[index], {
                commandType: 'KI',
                content: expected.content
            });
        });
    });
});
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as assert from 'assert';
import { ChildProcess, spawn } from 'child_process';
import { Deferred } from 'ts-deferred';
import { cleanupUnitTest, prepareUnitTest } from '../../common/utils';
import * as CommandType from '../commands';
import { createAssessorInterface, IpcInterface } from '../ipcInterface';
// Shared state set by startProcess() and inspected by the test cases below.
// IPC interface to the spawned assessor process; undefined until startProcess() has run.
let assessor: IpcInterface | undefined;
// Set to true once the child process has emitted 'exit' or 'error'.
let procExit: boolean = false;
// True when the child emitted 'error' or exited with a non-zero code.
let procError: boolean = false;
/**
 * Spawn the dummy assessor script and attach an IPC interface to it.
 *
 * Side effects: assigns the module-level `assessor` and keeps `procExit` /
 * `procError` up to date from the child's 'error' and 'exit' events.
 */
function startProcess(): void {
    // stdin ignored, stdout piped, stderr shared with this process, fds 3 and 4 piped
    const stdio: {}[] = ['ignore', 'pipe', process.stderr, 'pipe', 'pipe'];
    const child: ChildProcess = spawn('python3 dummy_assessor.py', [], { stdio, cwd: 'core/test', shell: true });

    child.on('error', (error: Error): void => {
        procExit = true;
        procError = true;
    });
    child.on('exit', (code: number): void => {
        procExit = true;
        // any non-zero exit code counts as an error
        procError = (code !== 0);
    });

    // publish the interface first, then echo every command the child sends
    const ipc: IpcInterface = createAssessorInterface(child);
    assessor = ipc;
    ipc.onCommand((commandType: string, content: string): void => {
        console.log(commandType, content); // tslint:disable-line:no-console
    });
}
// Verifies that the assessor process keeps running after a normal command and
// exits cleanly after a TERMINATE command.
describe('core/ipcInterface.terminate', (): void => {
    /**
     * Run `check` after `delayMs` milliseconds.
     *
     * @param delayMs how long to wait before running the check
     * @param check   assertions to execute once the delay has elapsed
     * @returns a promise that resolves when `check` passes and rejects with the
     *          thrown error when it fails, so the failure is reported through the
     *          promise returned to the test runner instead of escaping from the
     *          timer callback and leaving the promise pending
     */
    function checkAfter(delayMs: number, check: () => void): Promise<void> {
        const deferred: Deferred<void> = new Deferred<void>();
        setTimeout(
            () => {
                try {
                    check();
                    deferred.resolve();
                } catch (error) {
                    deferred.reject(error);
                }
            },
            delayMs);

        return deferred.promise;
    }

    before(() => {
        prepareUnitTest();
        startProcess();
    });

    after(() => {
        cleanupUnitTest();
    });

    it('normal', () => {
        (<IpcInterface>assessor).sendCommand(
            CommandType.REPORT_METRIC_DATA,
            '{"trial_job_id":"A","type":"periodical","value":1}');

        // one second later the process must still be running and error free
        return checkAfter(1000, (): void => {
            assert.ok(!procExit);
            assert.ok(!procError);
        });
    });

    it('terminate', () => {
        (<IpcInterface>assessor).sendCommand(CommandType.TERMINATE);

        // one second later the process must have exited without error
        return checkAfter(1000, (): void => {
            assert.ok(procExit);
            assert.ok(!procError);
        });
    });
});
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { assert } from 'console';
import * as fs from 'fs';
import { Deferred } from 'ts-deferred';
import { DataStore, MetricData, MetricDataRecord, MetricType,
TrialJobEvent, TrialJobEventRecord, TrialJobInfo } from '../../common/datastore';
import { ExperimentProfile, TrialJobStatistics } from '../../common/manager';
import { TrialJobStatus } from '../../common/trainingService';
/**
 * Minimal in-memory record store used by the mocked data store.
 *
 * Records live in an array; an optional string key maps to a record's array
 * index so that a keyed record can be overwritten and looked up. The whole
 * store is written to a JSON file after every save. Nothing is ever read back
 * from that file.
 */
class SimpleDb {
    private name: string = '';
    private fileName: string = '';
    private db: Array<any> = new Array();
    private map: Map<string, number> = new Map<string, number>(); // map key to data index

    /**
     * @param name     label written into the persisted file and used in log messages
     * @param filename path of the JSON file written by persist()
     */
    constructor (name: string, filename: string) {
        this.name = name;
        this.fileName = filename;
    }

    /**
     * Store a record and persist the store.
     *
     * @param data record to store
     * @param key  optional key; when it is already known the existing record is
     *             replaced, otherwise the record is appended. An empty-string key
     *             is treated like no key at all.
     */
    async saveData(data: any, key?: string): Promise<void> {
        const existingIndex: number | undefined = key ? this.map.get(key) : undefined;
        let index: number;
        if (existingIndex === undefined) {
            index = this.db.push(data) - 1;
        } else {
            index = existingIndex;
            this.db[index] = data;
        }
        if (key) {
            this.map.set(key, index);
        }
        await this.persist();
    }

    /**
     * @returns the internal record array itself (not a copy), in insertion order
     */
    async listAllData(): Promise<Array<any>> {
        return this.db;
    }

    /**
     * Look up a record by key.
     *
     * @returns the stored record, or undefined (after logging a message) when the key is unknown
     */
    async getData(key: string): Promise<any> {
        const index: number | undefined = this.map.get(key);
        if (index === undefined) {
            console.log(`Key not found: ${this.name}, ${key}`);

            return undefined;
        }

        // Indices in the map always come from Array.push(), so they are valid positions in db.
        return this.db[index];
    }

    /**
     * Write the name, all records and the key index to the JSON file.
     * A write failure rejects the returned promise instead of throwing synchronously.
     */
    async persist(): Promise<void> {
        fs.writeFileSync(this.fileName, JSON.stringify({
            name: this.name,
            data: this.db,
            index: JSON.stringify([...this.map])
        }, null, 4));
    }
}
/**
 * DataStore implementation for unit tests, backed by three SimpleDb instances
 * (experiment profiles, trial job events and metrics).
 *
 * NOTE: `assert` in this file is imported from 'console', so a failed check is
 * only logged; it does not throw.
 */
class MockedDataStore implements DataStore {
    private dbExpProfile: SimpleDb = new SimpleDb('exp_profile', './exp_profile.json');
    private dbTrialJobs: SimpleDb = new SimpleDb('trial_jobs', './trial_jobs.json');
    private dbMetrics: SimpleDb = new SimpleDb('metrics', './metrics.json');

    /** Nothing to initialize; resolves immediately. */
    init(): Promise<void> {
        return Promise.resolve();
    }

    /** Nothing to release; resolves immediately. */
    close(): Promise<void> {
        return Promise.resolve();
    }

    /** Store the profile under its experiment id, replacing any previous profile with that id. */
    async storeExperimentProfile(experimentProfile: ExperimentProfile): Promise<void> {
        await this.dbExpProfile.saveData(experimentProfile, experimentProfile.id);
    }

    /** @returns the profile stored for `experimentId`, or undefined when none was stored */
    async getExperimentProfile(experimentId: string): Promise<ExperimentProfile> {
        return await this.dbExpProfile.getData(experimentId);
    }

    /**
     * Append a trial job event, timestamped with the current time.
     *
     * @param event      event to record
     * @param trialJobId job the event belongs to
     * @param data       optional payload stored with the event
     */
    async storeTrialJobEvent(event: TrialJobEvent, trialJobId: string, data?: string | undefined): Promise<void> {
        const dataRecord: TrialJobEventRecord = {
            event: event,
            timestamp: new Date(),
            trialJobId: trialJobId,
            data: data
        };
        await this.dbTrialJobs.saveData(dataRecord);
    }

    /** @returns one entry per distinct job status, with the number of jobs currently in that status */
    async getTrialJobStatistics(): Promise<any[]> {
        const jobs: TrialJobInfo[] = await this.listTrialJobs();
        const counts: Map<TrialJobStatus, number> = new Map();
        for (const job of jobs) {
            counts.set(job.status, (counts.get(job.status) || 0) + 1);
        }

        const result: TrialJobStatistics[] = [];
        counts.forEach((trialJobNumber: number, trialJobStatus: TrialJobStatus) => {
            result.push({ trialJobStatus, trialJobNumber });
        });

        return result;
    }

    /**
     * Rebuild the job list by replaying all stored events.
     *
     * @param status when given, only jobs currently in this status are returned
     * @returns the matching jobs; SUCCEEDED jobs also carry their final metric data
     */
    async listTrialJobs(status?: TrialJobStatus): Promise<TrialJobInfo[]> {
        const trialJobEvents: TrialJobEventRecord[] = await this.dbTrialJobs.listAllData();
        const jobsById: Map<string, TrialJobInfo> = this.getTrialJobsByReplayEvents(trialJobEvents);
        const result: TrialJobInfo[] = [];
        for (const jobInfo of jobsById.values()) {
            if (status && jobInfo.status !== status) {
                continue;
            }
            if (jobInfo.status === 'SUCCEEDED') {
                jobInfo.finalMetricData = await this.getFinalMetricData(jobInfo.id);
            }
            result.push(jobInfo);
        }

        return result;
    }

    /**
     * Parse a JSON-encoded metric and append it to the metrics store.
     *
     * @param trialJobId job the metric is reported for; checked against the id inside `data`
     * @param data       JSON string with trial_job_id, parameter_id, type and value
     */
    async storeMetricData(trialJobId: string, data: string): Promise<void> {
        const metrics = JSON.parse(data) as MetricData;
        assert(trialJobId === metrics.trial_job_id);
        await this.dbMetrics.saveData({
            trialJobId: metrics.trial_job_id,
            parameterId: metrics.parameter_id,
            type: metrics.type,
            data: metrics.value,
            timestamp: new Date()
        });
    }

    /** @returns every stored metric of the given type that belongs to the given job */
    async getMetricData(trialJobId: string, metricType: MetricType): Promise<MetricDataRecord[]> {
        const allMetrics: any[] = await this.dbMetrics.listAllData();

        return allMetrics
            .map((value: any) => <MetricDataRecord>value)
            .filter((metrics: MetricDataRecord) => metrics.type === metricType && metrics.trialJobId === trialJobId);
    }

    /** Not implemented in this mock; always throws synchronously. */
    public getTrialJob(trialJobId: string): Promise<TrialJobInfo> {
        throw new Error("Method not implemented.");
    }

    /** @returns the single FINAL metric of the job, or undefined unless exactly one exists */
    private async getFinalMetricData(trialJobId: string): Promise<any> {
        const metrics: MetricDataRecord[] = await this.getMetricData(trialJobId, "FINAL");
        assert(metrics.length <= 1);

        return metrics.length === 1 ? metrics[0] : undefined;
    }

    /**
     * Map an event to the job status it leads to. Two events have a dedicated
     * mapping; every other event is used as the status unchanged.
     */
    private getJobStatusByLatestEvent(event: TrialJobEvent): TrialJobStatus {
        switch (event) {
            case 'USER_TO_CANCEL':
                return 'USER_CANCELED';
            case 'ADD_CUSTOMIZED':
                return 'WAITING';
            default:
                return <TrialJobStatus>event;
        }
    }

    /**
     * Fold the event history into one TrialJobInfo per job id.
     * Start and end times are taken from the timestamp recorded with the event,
     * not from the moment the history is replayed.
     */
    private getTrialJobsByReplayEvents(trialJobEvents: TrialJobEventRecord[]): Map<string, TrialJobInfo> {
        const jobsById: Map<string, TrialJobInfo> = new Map();
        // assume data is stored by time ASC order
        for (const record of trialJobEvents) {
            let jobInfo: TrialJobInfo | undefined = jobsById.get(record.trialJobId);
            if (jobInfo === undefined) {
                jobInfo = {
                    id: record.trialJobId,
                    status: this.getJobStatusByLatestEvent(record.event)
                };
                jobsById.set(record.trialJobId, jobInfo);
            }
            switch (record.event) {
                case 'RUNNING':
                    jobInfo.startTime = record.timestamp;
                    break;
                case 'SUCCEEDED':
                case 'FAILED':
                case 'USER_CANCELED':
                case 'SYS_CANCELED':
                    jobInfo.endTime = record.timestamp;
                    break;
                default:
            }
            // the most recent event always determines the current status
            jobInfo.status = this.getJobStatusByLatestEvent(record.event);
        }

        return jobsById;
    }
}
export { MockedDataStore };
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
* MIT License
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { Deferred } from 'ts-deferred';
import { Provider } from 'typescript-ioc';
import { MethodNotImplementedError } from '../../common/errors';
import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService';
// IoC provider that creates a new MockedTrainingService on every get() call.
const testTrainingServiceProvider: Provider = {
    get: () => new MockedTrainingService()
};
/**
 * TrainingService stand-in for unit tests. It knows exactly two trial jobs
 * ('1234' and '3456') and one cluster metadata key ('mockedMetadataKey').
 */
class MockedTrainingService extends TrainingService {
    /** Build the detail record of an already succeeded trial job with the given id. */
    private static createSucceededJob(id: string): TrialJobDetail {
        return {
            id,
            status: 'SUCCEEDED',
            submitTime: new Date(),
            startTime: new Date(),
            endTime: new Date(),
            tags: ['test'],
            url: 'http://test',
            workingDirectory: '/tmp/mocked',
            form: {
                jobType: 'TRIAL'
            }
        };
    }

    // value returned for 'mockedMetadataKey' until it is overwritten
    public mockedMetaDataValue: string = "default";
    public jobDetail1: TrialJobDetail = MockedTrainingService.createSucceededJob('1234');
    public jobDetail2: TrialJobDetail = MockedTrainingService.createSucceededJob('3456');

    /** @returns both known jobs */
    public listTrialJobs(): Promise<TrialJobDetail[]> {
        return Promise.resolve([this.jobDetail1, this.jobDetail2]);
    }

    /** @returns the matching known job; rejects (without a reason) for any other id */
    public getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
        switch (trialJobId) {
            case '1234':
                return Promise.resolve(this.jobDetail1);
            case '3456':
                return Promise.resolve(this.jobDetail2);
            default:
                return Promise.reject();
        }
    }

    /** No-op. */
    async run(): Promise<void> {
    }

    /** No-op: the listener is not registered anywhere. */
    public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
    }

    /** No-op. */
    public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
    }

    /** @returns a promise that is never settled */
    public submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        return new Promise<TrialJobDetail>(() => {
            // intentionally left pending
        });
    }

    /** Resolves for the two known job ids; rejects with 'job id error' otherwise. */
    public cancelTrialJob(trialJobId: string): Promise<void> {
        const known: boolean = trialJobId === '1234' || trialJobId === '3456';
        if (!known) {
            return Promise.reject('job id error');
        }

        return Promise.resolve();
    }

    /** Stores `value` for 'mockedMetadataKey'; rejects with 'key error' for any other key. */
    public setClusterMetadata(key: string, value: string): Promise<void> {
        if (key != 'mockedMetadataKey') {
            return Promise.reject('key error');
        }
        this.mockedMetaDataValue = value;

        return Promise.resolve();
    }

    /** @returns the value stored for 'mockedMetadataKey'; rejects with 'key error' for any other key */
    public getClusterMetadata(key: string): Promise<string> {
        if (key != 'mockedMetadataKey') {
            return Promise.reject('key error');
        }

        return Promise.resolve(this.mockedMetaDataValue);
    }

    /** Not implemented in this mock; always throws synchronously. */
    public cleanUp(): Promise<void> {
        throw new MethodNotImplementedError();
    }
}
export{MockedTrainingService, testTrainingServiceProvider}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { assert, expect } from 'chai';
import { Container, Scope } from 'typescript-ioc';
import * as component from '../../common/component';
import { Database, DataStore } from '../../common/datastore';
import { Manager } from '../../common/manager';
import { TrainingService } from '../../common/trainingService';
import { cleanupUnitTest, prepareUnitTest } from '../../common/utils';
import { NNIDataStore } from '../nniDataStore';
import { NNIManager } from '../nnimanager';
import { SqlDB } from '../sqlDatabase';
import { MockedTrainingService } from './mockedTrainingService';
/**
 * Prepare the unit-test environment, register the singleton bindings used by
 * the manager tests, and wait for the data store to finish initializing.
 */
async function initContainer(): Promise<void> {
    prepareUnitTest();

    // the training service is mocked; manager, database and data store are the real implementations
    Container.bind(TrainingService).to(MockedTrainingService).scope(Scope.Singleton);
    Container.bind(Manager).to(NNIManager).scope(Scope.Singleton);
    Container.bind(Database).to(SqlDB).scope(Scope.Singleton);
    Container.bind(DataStore).to(NNIDataStore).scope(Scope.Singleton);

    const dataStore: DataStore = component.get<DataStore>(DataStore);
    await dataStore.init();
}
// Test suite for the Manager resolved through the IoC container, running on top
// of the mocked training service.
// Every asynchronous test returns its promise (via async/await), so a rejection
// or a failed assertion is reported by the test runner and never swallowed.
describe('Unit test for nnimanager', function () {
    // a regular function is required here so that `this` is the suite context
    this.timeout(10000);

    let nniManager: Manager;

    // the only cluster metadata key the mocked training service accepts
    const clusterMetadataKey = 'mockedMetadataKey';

    const experimentParams = {
        authorName: 'zql',
        experimentName: 'naive_experiment',
        trialConcurrency: 2,
        maxExecDuration: 5,
        maxTrialNum: 2,
        searchSpace: '{"x":1}',
        tuner: {
            tunerCommand: 'python3 hyperopt.py',
            tunerCwd: 'core/test',
            tunerCheckpointDirectory: '',
            tunerGpuNum: 1
        },
        assessor: {
            assessorCommand: 'python3 dummy_assessor.py',
            assessorCwd: 'core/test',
            assessorCheckpointDirectory: '',
            assessorGpuNum: 1
        }
    };

    before(async () => {
        await initContainer();
        nniManager = component.get(Manager);
    });

    after(() => {
        cleanupUnitTest();
    });

    it('test resumeExperiment', () => {
        //TODO: add resume experiment unit test
    });

    it('test startExperiment', async () => {
        const experimentId = await nniManager.startExperiment(experimentParams);
        expect(experimentId.length).to.be.equal(8);
    });

    it('test listTrialJobs', () => {
        //FIXME: not implemented
        //return nniManager.listTrialJobs().then(function (trialJobDetails) {
        //    expect(trialJobDetails.length).to.be.equal(2);
        //}).catch(function (error) {
        //    assert.fail(error);
        //})
    });

    it('test getTrialJob valid', async () => {
        // query an existing id
        const trialJobDetail = await nniManager.getTrialJob('1234');
        expect(trialJobDetail.id).to.be.equal('1234');
    });

    it('test getTrialJob with invalid id', async () => {
        // Querying a non-existent id must fail. The outcome is recorded first and
        // asserted afterwards, so the assertion cannot be caught by the same handler
        // that catches the expected rejection.
        let rejected: boolean = false;
        try {
            await nniManager.getTrialJob('4567');
        } catch (error) {
            rejected = true;
        }
        assert.isTrue(rejected, 'getTrialJob should fail for an unknown trial job id');
    });

    it('test getClusterMetadata', async () => {
        // default value is "default"
        const value = await nniManager.getClusterMetadata(clusterMetadataKey);
        expect(value).to.equal("default");
    });

    it('test setClusterMetadata and getClusterMetadata', async () => {
        // set a valid key, then read it back
        await nniManager.setClusterMetadata(clusterMetadataKey, "newdata");
        const value = await nniManager.getClusterMetadata(clusterMetadataKey);
        expect(value).to.equal("newdata");
    });

    //TODO: complete ut
    it('test cancelTrialJobByUser', async () => {
        await nniManager.cancelTrialJobByUser('1234');
    });

    it('test addCustomizedTrialJob', async () => {
        await nniManager.addCustomizedTrialJob('hyperParams');
    });
});
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as assert from 'assert';
import * as os from 'os';
import * as path from 'path';
import { Container } from 'typescript-ioc';
import * as component from '../../common/component';
import { Database, MetricDataRecord, TrialJobEvent, TrialJobEventRecord } from '../../common/datastore';
import { setExperimentStartupInfo } from '../../common/experimentStartupInfo';
import { ExperimentParams, ExperimentProfile } from '../../common/manager';
import { cleanupUnitTest, getDefaultDatabaseDir, mkDirP, prepareUnitTest } from '../../common/utils';
import { SqlDB } from '../sqlDatabase';
// Experiment parameters used by the '#1' profiles below: tuner section only, no assessor.
const expParams1: ExperimentParams = {
authorName: 'ZhangSan',
experimentName: 'Exp1',
trialConcurrency: 3,
maxExecDuration: 100,
maxTrialNum: 5,
searchSpace: 'SS',
tuner: {
tunerCommand: './tuner.sh',
tunerCwd: '.',
tunerCheckpointDirectory: '/tmp',
tunerGpuNum: 0
}
};
// Experiment parameters used by the '#2' profiles below: includes an assessor section,
// an empty search space, and a tuner without tunerGpuNum.
const expParams2: ExperimentParams = {
authorName: 'LiSi',
experimentName: 'Exp2',
trialConcurrency: 5,
maxExecDuration: 1000,
maxTrialNum: 5,
searchSpace: '',
tuner: {
tunerCommand: 'python tuner.py',
tunerCwd: '/tmp',
tunerCheckpointDirectory: '/tmp'
},
assessor: {
assessorCommand: 'python assessor.py',
assessorCwd: '/tmp',
assessorCheckpointDirectory: '/tmp'
}
};
// Profiles stored in the database during setup: two revisions each of experiments '#1' and '#2'.
// The tests refer to these entries by array index, so the order must not change.
const profiles: ExperimentProfile[] = [
{ params: expParams1, id: '#1', execDuration: 0, startTime: new Date(), endTime: undefined, revision: 1 },
{ params: expParams1, id: '#1', execDuration: 0, startTime: new Date(), endTime: new Date(), revision: 2 },
{ params: expParams2, id: '#2', execDuration: 0, startTime: new Date(), endTime: new Date(), revision: 2 },
{ params: expParams2, id: '#2', execDuration: 0, startTime: new Date(), endTime: new Date(), revision: 3 }
];
// Trial job events stored during setup. The trailing number on each line is the array index
// that the tests pass to assertRecordsEqual.
const events: TrialJobEventRecord[] = [
{ timestamp: new Date(), event: 'WAITING', trialJobId: 'A', data: 'hello' }, // 0
{ timestamp: new Date(), event: 'UNKNOWN', trialJobId: 'B', data: 'world' }, // 1
{ timestamp: new Date(), event: 'RUNNING', trialJobId: 'B', data: undefined }, // 2
{ timestamp: new Date(), event: 'RUNNING', trialJobId: 'A', data: '123' }, // 3
{ timestamp: new Date(), event: 'FAILED', trialJobId: 'A', data: undefined } // 4
];
// Metric records stored during setup (each one is JSON-serialized before being stored).
// The trailing number on each line is the array index that the tests pass to assertRecordsEqual.
const metrics: MetricDataRecord[] = [
{ timestamp: new Date(), trialJobId: 'A', parameterId: '1', type: 'PERIODICAL', sequence: 0, data: 1.1 }, // 0
{ timestamp: new Date(), trialJobId: 'B', parameterId: '2', type: 'PERIODICAL', sequence: 0, data: 2.1 }, // 1
{ timestamp: new Date(), trialJobId: 'A', parameterId: '1', type: 'PERIODICAL', sequence: 1, data: 1.2 }, // 2
{ timestamp: new Date(), trialJobId: 'A', parameterId: '1', type: 'FINAL', sequence: 0, data: 1.3 }, // 3
{ timestamp: new Date(), trialJobId: 'C', parameterId: '2', type: 'PERIODICAL', sequence: 1, data: 2.1 }, // 4
{ timestamp: new Date(), trialJobId: 'C', parameterId: '2', type: 'FINAL', sequence: 0, data: 2.2 } // 5
];
/**
 * Asserts that a record read back from the database matches the value that was stored.
 *
 * The record's timestamp only has to fall strictly between 2018-07-01 (local time) and "now";
 * every other enumerable key of `value` must be equal on `record`.
 */
// tslint:disable-next-line:no-any
function assertRecordEqual(record: any, value: any): void {
    const earliestPlausible: Date = new Date(2018, 6, 1);
    assert.ok(record.timestamp > earliestPlausible);
    assert.ok(record.timestamp < new Date());
    for (const key in value) { // tslint:disable-line:no-for-in
        if (key === 'timestamp') {
            // The timestamp was range-checked above; it is not compared with the input.
            continue;
        }
        assert.equal(record[key], value[key]);
    }
}
/**
 * Asserts that `records` matches, position by position, the entries of `inputs`
 * selected by `indices` (records[i] is compared with inputs[indices[i]]).
 */
// tslint:disable-next-line:no-any
function assertRecordsEqual(records: any[], inputs: any[], indices: number[]): void {
    assert.equal(records.length, indices.length);
    records.forEach((record, position) => {
        assertRecordEqual(record, inputs[indices[position]]);
    });
}
describe('core/sqlDatabase', () => {
    // Assigned in the `before` hook; every test reads it through `database()`.
    let db: SqlDB | undefined;
    const database = (): SqlDB => db as SqlDB;

    // Create a database under the default directory and load all fixtures,
    // one at a time and in array order.
    before(async () => {
        prepareUnitTest();
        const dbDir: string = getDefaultDatabaseDir();
        await mkDirP(dbDir);
        db = new SqlDB();
        await database().init(true, dbDir);
        for (const profile of profiles) {
            await database().storeExperimentProfile(profile);
        }
        for (const event of events) {
            await database().storeTrialJobEvent(event.event as TrialJobEvent, event.trialJobId, event.data);
        }
        for (const metric of metrics) {
            await database().storeMetricData(metric.trialJobId, JSON.stringify(metric));
        }
    });

    after(() => {
        cleanupUnitTest();
    });

    it('queryExperimentProfile without revision', async () => {
        const found: ExperimentProfile[] = await database().queryExperimentProfile('#1');
        assert.equal(found.length, 2);
        // Revision 2 is expected before revision 1.
        assert.deepEqual(found[0], profiles[1]);
        assert.deepEqual(found[1], profiles[0]);
    });

    it('queryExperimentProfile with revision', async () => {
        const found: ExperimentProfile[] = await database().queryExperimentProfile('#1', 2);
        assert.equal(found.length, 1);
        assert.deepEqual(found[0], profiles[1]);
    });

    it('queryLatestExperimentProfile', async () => {
        const latest: ExperimentProfile = await database().queryLatestExperimentProfile('#2');
        assert.deepEqual(latest, profiles[3]);
    });

    it('queryTrialJobEventByEvent without trialJobId', async () => {
        const found: TrialJobEventRecord[] = await database().queryTrialJobEvent(undefined, 'RUNNING');
        assertRecordsEqual(found, events, [2, 3]);
    });

    it('queryTrialJobEventByEvent with trialJobId', async () => {
        const found: TrialJobEventRecord[] = await database().queryTrialJobEvent('A', 'RUNNING');
        assertRecordsEqual(found, events, [3]);
    });

    it('queryTrialJobEventById', async () => {
        const found: TrialJobEventRecord[] = await database().queryTrialJobEvent('B');
        assertRecordsEqual(found, events, [1, 2]);
    });

    it('queryMetricDataByType without trialJobId', async () => {
        const found: MetricDataRecord[] = await database().queryMetricData(undefined, 'FINAL');
        assertRecordsEqual(found, metrics, [3, 5]);
    });

    it('queryMetricDataByType with trialJobId', async () => {
        const found: MetricDataRecord[] = await database().queryMetricData('A', 'PERIODICAL');
        assertRecordsEqual(found, metrics, [0, 2]);
    });

    it('queryMetricDataById', async () => {
        const found: MetricDataRecord[] = await database().queryMetricData('B');
        assertRecordsEqual(found, metrics, [1]);
    });

    it('empty result', async () => {
        const found: MetricDataRecord[] = await database().queryMetricData('X');
        assert.equal(found.length, 0);
    });
});
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as assert from 'assert';
import { EventEmitter } from 'events';
import { TrainingService, TrialJobDetail, TrialJobStatus } from '../common/trainingService';
import { delay } from '../common/utils';
type TrialJobMaintainerEvent = TrialJobStatus | 'EXPERIMENT_DONE';
/**
 * Keeps a map of trial jobs that are still being tracked, polls the training service for
 * their status, and reports status changes to listeners through a single 'all' event.
 */
class TrialJobs {
    private eventEmitter: EventEmitter;
    private trialJobs: Map<string, TrialJobDetail>;
    private noMoreTrials: boolean;
    private stopLoop: boolean;
    private trainingService: TrainingService;
    private pastExecDuration: number; // second
    private maxExecDuration: number; // second

    /**
     * @param trainingService service queried for the current detail of each tracked job
     * @param pastExecDuration seconds already counted against the experiment before run()
     * @param maxExecDuration total seconds after which run() stops polling
     */
    constructor(
        trainingService: TrainingService,
        pastExecDuration: number, // second
        maxExecDuration: number // second
    ) {
        this.trainingService = trainingService;
        this.pastExecDuration = pastExecDuration;
        this.maxExecDuration = maxExecDuration;
        this.eventEmitter = new EventEmitter();
        this.trialJobs = new Map<string, TrialJobDetail>();
        this.noMoreTrials = false;
        this.stopLoop = false;
    }

    /** Starts (or continues) tracking a job under the given key. */
    public setTrialJob(key: string, value: TrialJobDetail): void {
        this.trialJobs.set(key, value);
    }

    /** Returns the tracked detail for the key, or undefined when it is not tracked. */
    public getTrialJob(key: string): TrialJobDetail | undefined {
        return this.trialJobs.get(key);
    }

    /** Marks that no further jobs will be added; run() ends once the map is empty. */
    public setNoMoreTrials(): void {
        this.noMoreTrials = true;
    }

    /** Asks run() to leave its loop at the next iteration. */
    public setStopLoop(): void {
        this.stopLoop = true;
    }

    /** Replaces the maximum execution duration (seconds) checked by run(). */
    public updateMaxExecDuration(duration: number): void {
        this.maxExecDuration = duration;
    }

    /** Registers a listener for every event emitted by this object. */
    public on(listener: (event: TrialJobMaintainerEvent, trialJobDetail: TrialJobDetail) => void): void {
        this.eventEmitter.addListener('all', listener);
    }

    /**
     * Queries the training service once, sequentially, for every job tracked at call time.
     *
     * - SUCCEEDED / USER_CANCELED / FAILED / SYS_CANCELED: emit the status, then stop tracking
     *   the job (failed jobs are not retried in the current version).
     * - RUNNING: if the tracked detail is still WAITING, store the new detail and emit.
     * - WAITING / UNKNOWN / anything else: nothing happens.
     */
    public async requestTrialJobsStatus(): Promise<void> {
        // Snapshot the keys so that deletions below do not disturb the iteration.
        const trackedIds: string[] = Array.from(this.trialJobs.keys());
        for (const trialJobId of trackedIds) {
            const latest: TrialJobDetail = await this.trainingService.getTrialJob(trialJobId);
            const status: TrialJobStatus = latest.status;
            if (status === 'SUCCEEDED' || status === 'USER_CANCELED'
                || status === 'FAILED' || status === 'SYS_CANCELED') {
                this.reportAndForget(trialJobId, latest);
            } else if (status === 'RUNNING') {
                this.reportIfJustStarted(trialJobId, latest);
            }
            // TO DO: add warning in log for unexpected statuses
        }
    }

    /**
     * Polls job statuses every 5 seconds until the time budget is used up, a stop was
     * requested, or no more trials are expected and none are tracked. Always finishes by
     * emitting 'EXPERIMENT_DONE' (without a job detail).
     */
    public async run(): Promise<void> {
        const startedAtMs: number = new Date().getTime();
        const elapsedSeconds = (): number => (Date.now() - startedAtMs) / 1000 + this.pastExecDuration;
        while (elapsedSeconds() < this.maxExecDuration) {
            const allTrialsDone: boolean = this.noMoreTrials && this.trialJobs.size === 0;
            if (this.stopLoop || allTrialsDone) {
                break;
            }
            await this.requestTrialJobsStatus();
            await delay(5000);
        }
        this.eventEmitter.emit('all', 'EXPERIMENT_DONE');
    }

    /** Emits the job's final status and then removes it from the tracked map. */
    private reportAndForget(trialJobId: string, detail: TrialJobDetail): void {
        this.eventEmitter.emit('all', detail.status, detail);
        this.trialJobs.delete(trialJobId);
    }

    /** Emits RUNNING only for the WAITING -> RUNNING transition, storing the new detail first. */
    private reportIfJustStarted(trialJobId: string, detail: TrialJobDetail): void {
        const previous: TrialJobDetail | undefined = this.trialJobs.get(trialJobId);
        assert(previous);
        if (previous && previous.status === "WAITING") {
            this.trialJobs.set(trialJobId, detail);
            this.eventEmitter.emit('all', detail.status, detail);
        }
    }
}
export { TrialJobs, TrialJobMaintainerEvent };
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { Container, Scope } from 'typescript-ioc';
import * as component from './common/component';
import { Database, DataStore } from './common/datastore';
import { setExperimentStartupInfo } from './common/experimentStartupInfo';
import { getLogger, Logger } from './common/log';
import { Manager } from './common/manager';
import { TrainingService } from './common/trainingService';
import { parseArg, uniqueString, mkDirP, getLogDir } from './common/utils';
import { NNIDataStore } from './core/nniDataStore';
import { NNIManager } from './core/nnimanager';
import { SqlDB } from './core/sqlDatabase';
import { RestServer } from './rest_server/server';
import { LocalTrainingService } from './training_service/local/localTrainingService';
import {
RemoteMachineTrainingService
} from './training_service/remote_machine/remoteMachineTrainingService';
/**
 * Records the experiment startup info for this process.
 *
 * For start mode 'new' a fresh 8-character id from uniqueString is used; for any other
 * mode the supplied resumeExperimentId is used.
 */
function initStartupInfo(startExpMode: string, resumeExperimentId: string) {
    if (startExpMode === 'new') {
        setExperimentStartupInfo(true, uniqueString(8));
    } else {
        setExperimentStartupInfo(false, resumeExperimentId);
    }
}
/**
 * Binds the IoC container for the chosen platform and initializes the data store.
 *
 * @param platformMode 'local' or 'remote'; selects the TrainingService implementation
 * @throws Error when platformMode is neither 'local' nor 'remote'
 */
async function initContainer(platformMode: string): Promise<void> {
    if (platformMode === 'local') {
        Container.bind(TrainingService).to(LocalTrainingService).scope(Scope.Singleton);
    } else if (platformMode === 'remote') {
        Container.bind(TrainingService).to(RemoteMachineTrainingService).scope(Scope.Singleton);
    } else {
        // Report the argument that was actually passed in, not the module-level `mode`.
        throw new Error(`Error: unsupported mode: ${platformMode}`);
    }
    Container.bind(Manager).to(NNIManager).scope(Scope.Singleton);
    Container.bind(Database).to(SqlDB).scope(Scope.Singleton);
    Container.bind(DataStore).to(NNIDataStore).scope(Scope.Singleton);
    const ds: DataStore = component.get(DataStore);
    await ds.init();
}
/** Prints the command-line synopsis of main.js via console.info. */
function usage(): void {
    const synopsis: string =
        'usage: node main.js --port <port> --mode <local/remote> --start_mode <new/resume> --experiment_id <id>';
    console.info(synopsis);
}
// REST port: RestServer.DEFAULT_PORT unless --port/-p carries a non-empty value.
const strPort: string = parseArg(['--port', '-p']);
const port: number = (strPort && strPort.length > 0) ? parseInt(strPort, 10) : RestServer.DEFAULT_PORT;

// Platform mode is mandatory and must be one of the supported values.
const mode: string = parseArg(['--mode', '-m']);
if (mode !== 'local' && mode !== 'remote') {
    usage();
    process.exit(1);
}

// Start mode is mandatory as well.
const startMode: string = parseArg(['--start_mode', '-s']);
if (startMode !== 'new' && startMode !== 'resume') {
    usage();
    process.exit(1);
}

// Resuming requires a non-blank experiment id.
const experimentId: string = parseArg(['--experiment_id', '-id']);
const resumeWithoutId: boolean = startMode === 'resume' && experimentId.trim().length < 1;
if (resumeWithoutId) {
    usage();
    process.exit(1);
}

initStartupInfo(startMode, experimentId);

/** Binds the container and starts the REST server; failures in those steps are only logged. */
async function launchRestServer(): Promise<void> {
    const log: Logger = getLogger();
    try {
        await initContainer(mode);
        const restServer: RestServer = component.get(RestServer);
        await restServer.start(port);
        log.info(`Rest server listening on: ${restServer.endPoint}`);
    } catch (err) {
        log.error(`${err.stack}`);
    }
}

// The log directory must exist before the logger is obtained inside launchRestServer.
mkDirP(getLogDir()).then(launchRestServer).catch((err: Error) => {
    console.error(`Failed to create log dir: ${err.stack}`);
});
{
"name": "nni",
"version": "1.0.0",
"main": "index.js",
"scripts": {
"postbuild": "cp -f --parent scripts/*.py ./dist/",
"build": "tsc",
"test": "mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --colors",
"start": "node dist/main.js"
},
"license": "MIT",
"dependencies": {
"chai-as-promised": "^7.1.1",
"child-process-promise": "^2.2.1",
"express": "^4.16.3",
"node-nvidia-smi": "^1.0.0",
"rx": "^4.1.0",
"serve": "^9.6.0",
"sqlite3": "^4.0.2",
"ssh2": "^0.6.1",
"stream-buffers": "^3.0.2",
"tail-stream": "^0.3.4",
"tree-kill": "^1.2.0",
"ts-deferred": "^1.0.4",
"typescript-ioc": "^1.2.4",
"typescript-string-operations": "^1.3.1"
},
"devDependencies": {
"@types/chai": "^4.1.4",
"@types/chai-as-promised": "^7.1.0",
"@types/express": "^4.16.0",
"@types/mocha": "^5.2.5",
"@types/node": "^10.5.5",
"@types/request": "^2.47.1",
"@types/rx": "^4.1.1",
"@types/sqlite3": "^3.1.3",
"@types/ssh2": "^0.5.35",
"@types/stream-buffers": "^3.0.2",
"@types/tmp": "^0.0.33",
"chai": "^4.1.2",
"mocha": "^5.2.0",
"request": "^2.87.0",
"tmp": "^0.0.33",
"ts-node": "^7.0.0",
"tslint": "^5.11.0",
"tslint-microsoft-contrib": "^5.1.0",
"typescript": "^3.0.1"
},
"engines": {
"node": ">=10.0.0"
}
}
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { Request, Response, Router } from 'express';
import * as path from 'path';
import * as component from '../common/component';
import { DataStore, MetricDataRecord, TrialJobInfo } from '../common/datastore';
import { NNIError, NNIErrorNames } from '../common/errors';
import { isNewExperiment } from '../common/experimentStartupInfo';
import { getLogger, Logger } from '../common/log';
import { ExperimentProfile, Manager, TrialJobStatistics} from '../common/manager';
import { RestServer } from './server';
import { TensorBoard } from './tensorboard';
/**
 * Registers the NNI REST routes on an express Router. Each route forwards to the
 * experiment Manager or to the TensorBoard helper and answers with JSON.
 */
class NNIRestHandler {
    private restServer: RestServer;
    private nniManager: Manager;
    private tb: TensorBoard;
    private log: Logger;

    /**
     * @param rs the REST server hosting these routes; it is stopped when the experiment
     *           is stopped or when the status check fails
     */
    constructor(rs: RestServer) {
        this.nniManager = component.get(Manager);
        this.restServer = rs;
        this.tb = new TensorBoard();
        this.log = getLogger();
    }

    /**
     * Builds the router: one middleware that logs the request and sets CORS/JSON headers,
     * followed by every route of the API.
     */
    public createRestHandler(): Router {
        const router: Router = Router();

        // tslint:disable-next-line:typedef
        router.use((req: Request, res: Response, next) => {
            this.log.info(`${req.method}: ${req.url}: body:\n${JSON.stringify(req.body, undefined, 4)}`);
            res.header('Access-Control-Allow-Origin', '*');
            res.header('Access-Control-Allow-Headers', 'Origin, X-Requested-With, Content-Type, Accept');
            res.header('Access-Control-Allow-Methods', 'PUT,POST,GET,DELETE,OPTIONS');

            res.setHeader('Content-Type', 'application/json');
            next();
        });

        this.checkStatus(router);
        this.getExperimentProfile(router);
        this.updateExperimentProfile(router);
        this.startExperiment(router);
        this.stopExperiment(router);
        this.getTrialJobStatistics(router);
        this.setClusterMetaData(router);
        this.listTrialJobs(router);
        this.getTrialJob(router);
        this.addTrialJob(router);
        this.cancelTrialJob(router);
        this.getMetricData(router);
        this.getExample(router);
        this.getTriedParameters(router);
        this.startTensorBoard(router);
        this.stopTensorBoard(router);

        return router;
    }

    /**
     * Logs the error and answers with `{ error: message }`: 404 for an NNIError named
     * NOT_FOUND, 500 otherwise. When a response has already been sent, the error is
     * only logged.
     */
    private handle_error(err: Error, res: Response): void {
        this.log.info(err);
        // Some handlers keep working after res.send() (e.g. stopping the REST server).
        // Writing a second response would throw, so there is nothing more to do here.
        if (res.headersSent) {
            return;
        }
        if (err instanceof NNIError && err.name === NNIErrorNames.NOT_FOUND) {
            res.status(404);
        } else {
            res.status(500);
        }
        res.send({
            error: err.message
        });
    }

    // TODO add validators for request params, query, body

    /** GET /check-status: initializes the data store; on failure reports it and stops the REST server. */
    private checkStatus(router: Router): void {
        router.get('/check-status', (req: Request, res: Response) => {
            const ds: DataStore = component.get<DataStore>(DataStore);
            ds.init().then(() => {
                res.send();
            }).catch(async (err: Error) => {
                this.handle_error(err, res);
                this.log.error(err.message);
                this.log.error(`Database initialize failed, stopping rest server...`);
                await this.restServer.stop();
            });
        });
    }

    /** GET /experiment: returns the experiment profile. */
    private getExperimentProfile(router: Router): void {
        router.get('/experiment', (req: Request, res: Response) => {
            this.nniManager.getExperimentProfile().then((profile: ExperimentProfile) => {
                res.send(profile);
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /** PUT /experiment: updates the profile from the body; `update_type` comes from the query string. */
    private updateExperimentProfile(router: Router): void {
        router.put('/experiment', (req: Request, res: Response) => {
            this.nniManager.updateExperimentProfile(req.body, req.query.update_type).then(() => {
                res.send();
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /**
     * POST /experiment: starts a new experiment from the body and returns its id, or resumes
     * the existing one, depending on isNewExperiment().
     */
    private startExperiment(router: Router): void {
        router.post('/experiment', (req: Request, res: Response) => {
            if (isNewExperiment()) {
                this.nniManager.startExperiment(req.body).then((eid: string) => {
                    res.send({
                        experiment_id: eid
                    });
                }).catch((err: Error) => {
                    this.handle_error(err, res);
                });
            } else {
                this.nniManager.resumeExperiment().then(() => {
                    res.send();
                }).catch((err: Error) => {
                    this.handle_error(err, res);
                });
            }
        });
    }

    /**
     * DELETE /experiment: cleans up tensorboard jobs, stops the experiment, answers the
     * request, and only then stops the REST server.
     */
    private stopExperiment(router: Router): void {
        router.delete('/experiment', async (req: Request, res: Response) => {
            try {
                await this.tb.cleanUp();
                await this.nniManager.stopExperiment();
                res.send();
                this.log.debug('Stopping rest server');
                // A failure from here on is after the response; handle_error then only logs it.
                await this.restServer.stop();
            } catch (err) {
                this.handle_error(err, res);
            }
        });
    }

    /** GET /job-statistics: returns the trial job statistics. */
    private getTrialJobStatistics(router: Router): void {
        router.get('/job-statistics', (req: Request, res: Response) => {
            this.nniManager.getTrialJobStatistics().then((statistics: TrialJobStatistics[]) => {
                res.send(statistics);
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /**
     * PUT /experiment/cluster-metadata: stores every top-level key of the body, one after
     * another, with its value JSON-serialized.
     */
    private setClusterMetaData(router: Router): void {
        router.put('/experiment/cluster-metadata', async (req: Request, res: Response) => {
            // tslint:disable-next-line:no-any
            const metadata: any = req.body;
            const keys: string[] = Object.keys(metadata);
            try {
                for (const key of keys) {
                    await this.nniManager.setClusterMetadata(key, JSON.stringify(metadata[key]));
                }
                res.send();
            } catch (err) {
                this.handle_error(err, res);
            }
        });
    }

    /** GET /trial-jobs: lists trial jobs, optionally filtered by the `status` query parameter. */
    private listTrialJobs(router: Router): void {
        router.get('/trial-jobs', (req: Request, res: Response) => {
            this.nniManager.listTrialJobs(req.query.status).then((jobInfos: TrialJobInfo[]) => {
                jobInfos.forEach((trialJob: TrialJobInfo) => {
                    this.setErrorPathForFailedJob(trialJob);
                });
                res.send(jobInfos);
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /** GET /trial-jobs/:id: returns one trial job. */
    private getTrialJob(router: Router): void {
        router.get('/trial-jobs/:id', (req: Request, res: Response) => {
            this.nniManager.getTrialJob(req.params.id).then((jobDetail: TrialJobInfo) => {
                const jobInfo: TrialJobInfo = this.setErrorPathForFailedJob(jobDetail);
                res.send(jobInfo);
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /** POST /trial-jobs: adds a customized trial job; the body is passed on as a JSON string. */
    private addTrialJob(router: Router): void {
        router.post('/trial-jobs', async (req: Request, res: Response) => {
            this.nniManager.addCustomizedTrialJob(JSON.stringify(req.body)).then(() => {
                res.send();
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /** DELETE /trial-jobs/:id: cancels a trial job on behalf of the user. */
    private cancelTrialJob(router: Router): void {
        router.delete('/trial-jobs/:id', async (req: Request, res: Response) => {
            this.nniManager.cancelTrialJobByUser(req.params.id).then(() => {
                res.send();
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /** GET /metric-data/:job_id: returns metric records, optionally filtered by the `type` query parameter. */
    private getMetricData(router: Router): void {
        router.get('/metric-data/:job_id', async (req: Request, res: Response) => {
            this.nniManager.getMetricData(req.params.job_id, req.query.type).then((metricsData: MetricDataRecord[]) => {
                res.send(metricsData);
            }).catch((err: Error) => {
                this.handle_error(err, res);
            });
        });
    }

    /**
     * POST /tensorboard: starts tensorboard for the comma-separated `job_ids` query parameter
     * (optional `tensorboard_cmd`) and returns `{ endPoint }`.
     */
    private startTensorBoard(router: Router): void {
        router.post('/tensorboard', async (req: Request, res: Response) => {
            try {
                // tslint:disable-next-line:no-any
                const rawJobIds: any = req.query.job_ids;
                if (typeof rawJobIds !== 'string') {
                    // Without this check `.split` throws inside the async handler and the
                    // request is never answered.
                    res.status(400);
                    res.send({
                        error: 'Query parameter job_ids is required'
                    });

                    return;
                }
                const jobIds: string[] = rawJobIds.split(',');
                const tensorboardCmd: string | undefined = req.query.tensorboard_cmd;
                const endPoint: string = await this.tb.startTensorBoard(jobIds, tensorboardCmd);
                res.send({endPoint: endPoint});
            } catch (err) {
                this.handle_error(err, res);
            }
        });
    }

    /** DELETE /tensorboard: stops the tensorboard identified by the `endpoint` query parameter. */
    private stopTensorBoard(router: Router): void {
        router.delete('/tensorboard', async (req: Request, res: Response) => {
            try {
                const endPoint: string = req.query.endpoint;
                // Awaited inside try so that an error thrown while parsing the endpoint
                // is reported to the client as well.
                await this.tb.stopTensorBoard(endPoint);
                res.send();
            } catch (err) {
                this.handle_error(err, res);
            }
        });
    }

    /** GET /example: placeholder. NOTE(review): the handler is empty, so no response is ever sent. */
    private getExample(router: Router): void {
        // tslint:disable-next-line:no-empty
        router.get('/example', async (req: Request, res: Response) => {
        });
    }

    /** GET /tried-parameters: placeholder. NOTE(review): the handler is empty, so no response is ever sent. */
    private getTriedParameters(router: Router): void {
        // tslint:disable-next-line:no-empty
        router.get('/tried-parameters', async (req: Request, res: Response) => {
        });
    }

    /**
     * For a FAILED job that has a logPath, sets stderrPath to `<logPath>/.nni/stderr`.
     * The object is modified in place and returned; any other input is returned unchanged.
     */
    private setErrorPathForFailedJob(jobInfo: TrialJobInfo): TrialJobInfo {
        if (jobInfo === undefined || jobInfo.status !== 'FAILED' || jobInfo.logPath === undefined) {
            return jobInfo;
        }
        jobInfo.stderrPath = path.join(jobInfo.logPath, '.nni', 'stderr');

        return jobInfo;
    }
}
/** Creates the router carrying all NNI REST routes for the given REST server. */
export function createRestHandler(rs: RestServer): Router {
    return new NNIRestHandler(rs).createRestHandler();
}
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as bodyParser from 'body-parser';
import * as express from 'express';
import * as http from 'http';
import { Deferred } from 'ts-deferred';
import * as component from '../common/component';
import { getLogger, Logger } from '../common/log';
import { Manager } from '../common/manager';
import { createRestHandler } from './restHandler';
/**
 * Singleton HTTP server exposing the NNI REST API under /api/v1/nni.
 * start() and stop() each run at most once; repeated calls return the first call's promise.
 */
@component.Singleton
export class RestServer {
    public static readonly DEFAULT_PORT: number = 51188;
    private readonly API_ROOT_URL: string = '/api/v1/nni';
    private hostName: string = '0.0.0.0';
    private port: number = RestServer.DEFAULT_PORT;
    // Both stay undefined until the first start()/stop() call; that is how repeats are detected.
    private startTask!: Deferred<void>;
    private stopTask!: Deferred<void>;
    private app: express.Application = express();
    private server!: http.Server;
    private log: Logger = getLogger();

    /** URL built from the configured host name and port. */
    get endPoint(): string {
        // tslint:disable-next-line:no-http-string
        return `http://${this.hostName}:${this.port}`;
    }

    /**
     * Registers the REST handlers and starts listening.
     *
     * @param port port to listen on; a falsy value keeps DEFAULT_PORT
     * @param hostName host name to bind; a falsy value keeps '0.0.0.0'
     * @returns a promise resolved on the server's 'listening' event and rejected on 'error'
     */
    public start(port?: number, hostName?: string): Promise<void> {
        if (this.startTask !== undefined) {
            return this.startTask.promise;
        }
        this.startTask = new Deferred<void>();

        this.registerRestHandler();

        if (hostName) {
            this.hostName = hostName;
        }
        if (port) {
            this.port = port;
        }

        this.server = this.app.listen(this.port, this.hostName).on('listening', () => {
            this.startTask.resolve();
        }).on('error', (e: Error) => {
            this.startTask.reject(e);
        });

        return this.startTask.promise;
    }

    /**
     * Stops the server.
     *
     * Resolves immediately when the server was never started or failed to start; otherwise
     * waits for startup, closes the server, and resolves on 'close' or rejects with the
     * error reported by the server.
     */
    public stop(): Promise<void> {
        if (this.stopTask !== undefined) {
            return this.stopTask.promise;
        }
        this.stopTask = new Deferred<void>();

        if (this.startTask === undefined) {
            this.stopTask.resolve();

            return this.stopTask.promise;
        } else {
            this.startTask.promise.then(
                () => { // Started
                    this.server.close().on('close', () => {
                        this.log.info('Rest server stopped.');
                        this.stopTask.resolve();
                    }).on('error', (e: Error) => {
                        this.log.error(`Error occurred stopping Rest server: ${e.message}`);
                        // Hand the cause to the caller instead of rejecting with undefined.
                        this.stopTask.reject(e);
                    });
                },
                () => { // Start task rejected
                    this.stopTask.resolve();
                }
            );
        }

        return this.stopTask.promise;
    }

    /** Installs JSON body parsing and mounts the REST routes under API_ROOT_URL. */
    private registerRestHandler(): void {
        this.app.use(bodyParser.json());
        this.app.use(this.API_ROOT_URL, createRestHandler(this));
    }
}
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as component from '../common/component';
import { DataStore, TrialJobInfo } from '../common/datastore';
import { NNIErrorNames } from '../common/errors';
import { getLogger, Logger } from '../common/log';
import { HostJobApplicationForm, TrainingService, TrialJobStatus } from '../common/trainingService';
export class TensorBoard {
private DEFAULT_PORT: number = 6006;
private TENSORBOARD_COMMAND: string = 'PATH=$PATH:~/.local/bin:/usr/local/bin tensorboard';
private tbJobMap: Map<string, string>;
private trainingService: TrainingService;
private dataStore: DataStore;
private log: Logger = getLogger();
constructor() {
this.tbJobMap = new Map();
this.trainingService = component.get(TrainingService);
this.dataStore = component.get(DataStore);
}
public async startTensorBoard(trialJobIds: string[], tbCmd?: string, port?: number): Promise<string> {
let tensorBoardPort: number = this.DEFAULT_PORT;
if (port !== undefined) {
tensorBoardPort = port;
}
const host: string = await this.getJobHost(trialJobIds);
const tbEndpoint: string = `http://${host}:${tensorBoardPort}`;
try {
if (await this.isTensorBoardRunningOnHost(host)) {
await this.stopHostTensorBoard(host);
}
} catch (error) {
if (error.name !== NNIErrorNames.NOT_FOUND) {
throw error;
} else {
this.tbJobMap.delete(host);
}
}
const logDirs: string[] = [];
for (const id of trialJobIds) {
logDirs.push(await this.getLogDir(id));
}
let tensorBoardCmd: string = this.TENSORBOARD_COMMAND;
if (tbCmd !== undefined && tbCmd.trim().length > 0) {
tensorBoardCmd = tbCmd;
}
const cmd: string = `${tensorBoardCmd} --logdir ${logDirs.join(':')} --port ${tensorBoardPort}`;
const form: HostJobApplicationForm = {
jobType: 'HOST',
host: host,
cmd: cmd
};
const jobId: string = (await this.trainingService.submitTrialJob(form)).id;
this.tbJobMap.set(host, jobId);
return tbEndpoint;
}
public async cleanUp(): Promise<void> {
const stopTensorBoardTasks: Promise<void>[] = [];
this.tbJobMap.forEach((jobId: string, host: string) => {
stopTensorBoardTasks.push(this.stopHostTensorBoard(host).catch((err: Error) => {
this.log.error(`Error occurred stopping tensorboard service: ${err.message}`);
}));
});
await Promise.all(stopTensorBoardTasks);
}
public stopTensorBoard(endPoint: string): Promise<void> {
const host: string = this.getEndPointHost(endPoint);
return this.stopHostTensorBoard(host);
}
private stopHostTensorBoard(host: string): Promise<void> {
const jobId: string | undefined = this.tbJobMap.get(host);
if (jobId === undefined) {
return Promise.resolve();
}
return this.trainingService.cancelTrialJob(jobId);
}
private async isTensorBoardRunningOnHost(host: string): Promise<boolean> {
const jobId: string | undefined = this.tbJobMap.get(host);
if (jobId === undefined) {
return false;
}
const status: TrialJobStatus = (await this.trainingService.getTrialJob(jobId)).status;
return ['RUNNING', 'WAITING'].includes(status);
}
private async getJobHost(trialJobIds: string[]): Promise<string> {
if (trialJobIds === undefined || trialJobIds.length < 1) {
throw new Error('No trail job specified.');
}
const jobInfo: TrialJobInfo = await this.dataStore.getTrialJob(trialJobIds[0]);
const logPath: string | undefined = jobInfo.logPath;
if (logPath === undefined) {
throw new Error(`Failed to find job logPath: ${jobInfo.id}`);
}
return logPath.split('://')[1].split(':')[0]; //TODO use url parse
}
/**
 * Derive the log directory from a trial job's log path.
 *
 * @param trialJobId id of the trial job to look up in the data store
 * @returns the text between the first and second ':' that follow '://' in logPath
 * @throws Error when the job has no logPath
 */
private async getLogDir(trialJobId: string): Promise<string> {
    const jobInfo: TrialJobInfo = await this.dataStore.getTrialJob(trialJobId);
    if (jobInfo.logPath === undefined) {
        throw new Error(`Failed to find job logPath: ${jobInfo.id}`);
    }

    // NOTE(review): presumably logPath looks like "<scheme>://<host>:<dir>" - confirm.
    //TODO use url parse
    const afterScheme: string = jobInfo.logPath.split('://')[1];

    return afterScheme.split(':')[1];
}
/**
 * Extract the host part of an end point string.
 *
 * @param endPoint string expected to match "<anything>://<host>:<rest>"
 * @returns the first capture group of the pattern, i.e. the text between
 *          '://' and the last ':' (the groups are greedy)
 * @throws Error when endPoint does not match the pattern
 */
private getEndPointHost(endPoint: string): string {
    const parts = endPoint.match(/.*:\/\/(.*):(.*)/);
    if (parts === null) {
        throw new Error(`Invalid endPoint: ${endPoint}`);
    }

    return parts[1];
}
}
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { Deferred } from 'ts-deferred';
import { Provider } from 'typescript-ioc';
import { MetricDataRecord, MetricType, TrialJobInfo } from '../../common/datastore';
import { MethodNotImplementedError } from '../../common/errors';
import {
ExperimentParams, ExperimentProfile, Manager, ProfileUpdateType,
TrialJobStatistics
} from '../../common/manager';
import {
TrialJobApplicationForm, TrialJobDetail, TrialJobStatus
} from '../../common/trainingService';
/** IoC provider that returns a new MockedNNIManager each time a Manager is requested. */
export const testManagerProvider: Provider = {
    get: (): Manager => new MockedNNIManager()
};
/**
 * Manager stub used by the REST server unit tests.
 * Every method answers with fixed, in-memory data; nothing is persisted and the
 * arguments are ignored unless stated otherwise.
 */
export class MockedNNIManager extends Manager {
    /** Accepts any profile update and succeeds. */
    public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType ): Promise<void> {
        return Promise.resolve();
    }

    /** Reports two RUNNING jobs and one FAILED job. */
    public getTrialJobStatistics(): Promise<TrialJobStatistics[]> {
        const statistics: TrialJobStatistics[] = [{
            trialJobStatus: 'RUNNING',
            trialJobNumber: 2
        }, {
            trialJobStatus: 'FAILED',
            trialJobNumber: 1
        }];

        return Promise.resolve(statistics);
    }

    /** Accepts any hyper-parameter string and succeeds. */
    public addCustomizedTrialJob(hyperParams: string): Promise<void> {
        return Promise.resolve();
    }

    /** Succeeds without doing anything. */
    public resumeExperiment(): Promise<void> {
        return Promise.resolve();
    }

    /** Returns a fixed job detail (id '1234') in RUNNING state; the form is ignored. */
    public submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        return Promise.resolve(this.buildMockedJobDetail('RUNNING'));
    }

    /** Accepts any job id and succeeds. */
    public cancelTrialJobByUser(trialJobId: string): Promise<void> {
        return Promise.resolve();
    }

    /** Returns the same metadata value for every key. */
    public getClusterMetadata(key: string): Promise<string> {
        return Promise.resolve('METAVALUE1');
    }

    /** Returns a fixed experiment id; the params are ignored. */
    public startExperiment(experimentParams: ExperimentParams): Promise<string> {
        return Promise.resolve('id-1234');
    }

    /**
     * Succeeds for every key except 'exception_test_key', for which the returned
     * promise is rejected with Error('Test Error').
     */
    public setClusterMetadata(key: string, value: string): Promise<void> {
        const deferred: Deferred<void> = new Deferred<void>();
        if (key === 'exception_test_key') {
            deferred.reject(new Error('Test Error'));
        } else {
            deferred.resolve();
        }

        return deferred.promise;
    }

    /** Returns a fixed job detail (id '1234') in SUCCEEDED state; the requested id is ignored. */
    public getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
        return Promise.resolve(this.buildMockedJobDetail('SUCCEEDED'));
    }

    /** Not provided by this mock: throws synchronously. */
    public stopExperiment(): Promise<void> {
        throw new MethodNotImplementedError();
    }

    /** Not provided by this mock: throws synchronously. */
    public getMetricData(trialJobId: string, metricType: MetricType): Promise<MetricDataRecord[]> {
        throw new MethodNotImplementedError();
    }

    /** Returns a fixed experiment profile (id '2345', trialConcurrency 2). */
    public getExperimentProfile(): Promise<ExperimentProfile> {
        const now = (): Date => new Date();
        const profile: ExperimentProfile = {
            params: {
                authorName: 'test',
                experimentName: 'exp1',
                trialConcurrency: 2,
                maxExecDuration: 30,
                maxTrialNum: 3,
                searchSpace: '{lr: 0.01}',
                tuner: {
                    tunerCommand: 'python3 tuner.py',
                    tunerCwd: '/tmp/tunner',
                    tunerCheckpointDirectory: ''
                }
            },
            id: '2345',
            execDuration: 0,
            startTime: now(),
            endTime: now(),
            revision: 0
        };

        return Promise.resolve(profile);
    }

    /** Returns the same two jobs (one SUCCEEDED, one FAILED) whatever status filter is given. */
    public listTrialJobs(status?: TrialJobStatus): Promise<TrialJobInfo[]> {
        const succeededJob: TrialJobInfo = {
            id: '1234',
            status: 'SUCCEEDED',
            startTime: new Date(),
            endTime: new Date(),
            finalMetricData: 'lr: 0.01, val accuracy: 0.89, batch size: 256'
        };
        const failedJob: TrialJobInfo = {
            id: '3456',
            status: 'FAILED',
            startTime: new Date(),
            endTime: new Date(),
            finalMetricData: ''
        };

        return Promise.resolve([succeededJob, failedJob]);
    }

    /** Build the canned job detail shared by submitTrialJob and getTrialJob. */
    private buildMockedJobDetail(status: TrialJobStatus): TrialJobDetail {
        const jobDetail: TrialJobDetail = {
            id: '1234',
            status: status,
            submitTime: new Date(),
            startTime: new Date(),
            endTime: new Date(),
            tags: ['test'],
            // tslint:disable-next-line:no-http-string
            url: 'http://test',
            workingDirectory: '/tmp/mocked',
            form: {
                jobType: 'TRIAL'
            }
        };

        return jobDetail;
    }
}
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
// tslint:disable-next-line:no-implicit-dependencies
import { assert, expect } from 'chai';
// tslint:disable-next-line:no-implicit-dependencies
import * as request from 'request';
import { Container } from 'typescript-ioc';
import * as component from '../../common/component';
import { DataStore } from '../../common/datastore';
import { ExperimentProfile, Manager } from '../../common/manager';
import { TrainingService } from '../../common/trainingService';
import { cleanupUnitTest, prepareUnitTest } from '../../common/utils';
import { MockedDataStore } from '../../core/test/mockedDatastore';
import { MockedTrainingService } from '../../core/test/mockedTrainingService';
import { RestServer } from '../server';
import { testManagerProvider } from './mockedNNIManager';
// End-to-end checks of the REST routes: a real RestServer is started with the
// Manager, DataStore and TrainingService bindings replaced by mocks, and each
// test issues an HTTP request against it.
describe('Unit test for rest server', () => {

    // Base URL of the REST API; assigned in before() once the server is listening.
    let ROOT_URL: string;

    before((done: Mocha.Done) => {
        prepareUnitTest();
        Container.bind(Manager).provider(testManagerProvider);
        Container.bind(DataStore).to(MockedDataStore);
        Container.bind(TrainingService).to(MockedTrainingService);
        const restServer: RestServer = component.get(RestServer);
        restServer.start().then(() => {
            ROOT_URL = `${restServer.endPoint}/api/v1/nni`;
            done();
        }).catch((e: Error) => {
            // Hand the failure to mocha through done(): throwing inside this
            // handler would only produce an unhandled rejection and leave the
            // hook waiting until it times out.
            done(new Error(`Failed to start rest server: ${e.message}`));
        });
    });

    after(() => {
        component.get<RestServer>(RestServer).stop();
        cleanupUnitTest();
    });

    it('Test GET check-status', (done: Mocha.Done) => {
        request.get(`${ROOT_URL}/check-status`, (err: Error, res: request.Response) => {
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(200);
            }
            done();
        });
    });

    it('Test GET trial-jobs/:id', (done: Mocha.Done) => {
        // tslint:disable-next-line:no-any
        request.get(`${ROOT_URL}/trial-jobs/1234`, (err: Error, res: request.Response, body: any) => {
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(200);
                expect(JSON.parse(body).id).to.equal('1234');
            }
            done();
        });
    });

    it('Test GET experiment', (done: Mocha.Done) => {
        request.get(`${ROOT_URL}/experiment`, (err: Error, res: request.Response) => {
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(200);
            }
            done();
        });
    });

    it('Test GET trial-jobs', (done: Mocha.Done) => {
        request.get(`${ROOT_URL}/trial-jobs`, (err: Error, res: request.Response) => {
            // Check the transport error first: the response is only inspected
            // when the request itself succeeded.
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(200);
            }
            done();
        });
    });

    it('Test change concurrent-trial-jobs', (done: Mocha.Done) => {
        // Read the current profile, bump trialConcurrency, then PUT it back.
        // tslint:disable-next-line:no-any
        request.get(`${ROOT_URL}/experiment`, (err: Error, res: request.Response, body: any) => {
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(200);
                const profile: ExperimentProfile = JSON.parse(body);
                if (profile.params && profile.params.trialConcurrency) {
                    profile.params.trialConcurrency = 10;
                }

                const req: request.Options = {
                    uri: `${ROOT_URL}/experiment`,
                    method: 'PUT',
                    json: true,
                    body: profile
                };
                request(req, (error: Error, response: request.Response) => {
                    if (error) {
                        assert.fail(error.message);
                    } else {
                        expect(response.statusCode).to.equal(200);
                    }
                    done();
                });
            }
        });
    });

    it('Test PUT experiment/cluster-metadata exception', (done: Mocha.Done) => {
        // The mocked manager rejects this particular key, so a 500 is expected.
        const req: request.Options = {
            uri: `${ROOT_URL}/experiment/cluster-metadata`,
            method: 'PUT',
            json: true,
            body: {
                exception_test_key: 'test'
            }
        };
        request(req, (err: Error, res: request.Response) => {
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(500);
            }
            done();
        });
    });

    it('Test PUT experiment/cluster-metadata', (done: Mocha.Done) => {
        const req: request.Options = {
            uri: `${ROOT_URL}/experiment/cluster-metadata`,
            method: 'PUT',
            json: true,
            body: {
                MACHINE_LIST: [{
                    ip: '10.10.10.101',
                    port: 22,
                    username: 'test',
                    passwd: '1234'
                }, {
                    ip: '10.10.10.102',
                    port: 22,
                    username: 'test',
                    passwd: '1234'
                }]
            }
        };
        request(req, (err: Error, res: request.Response) => {
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(200);
            }
            done();
        });
    });

    it('Test POST experiment', (done: Mocha.Done) => {
        const req: request.Options = {
            uri: `${ROOT_URL}/experiment`,
            method: 'POST',
            json: true,
            body: {
                author: 'test',
                trial: {
                    entrypoint: 'python',
                    args: 'mnist.py'
                }
            }
        };
        // tslint:disable-next-line:no-any
        request(req, (err: Error, res: request.Response, body: any) => {
            if (err) {
                assert.fail(err.message);
            } else {
                expect(res.statusCode).to.equal(200);
                expect(body.experiment_id).to.equal('id-1234');
            }
            done();
        });
    });
});
#!/usr/bin/python
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import json
import os
import subprocess
import sys
import time
from xml.dom import minidom
def check_ready_to_run():
    '''
    Tell whether this process is the only running GPU metrics collector.

    Uses pgrep to list the processes whose full command line is exactly
    'python3 gpu_metrics_collector.py' and ignores the current process.

    Returns True when no other collector process is found, False otherwise.
    Raises subprocess.CalledProcessError when pgrep fails for a reason other
    than "nothing matched".
    '''
    try:
        pgrep_output = subprocess.check_output('pgrep -fx \'python3 gpu_metrics_collector.py\'', shell=True)
    except subprocess.CalledProcessError as err:
        # pgrep exits with status 1 when nothing matches (e.g. this script was
        # launched with a different command line): no other collector exists.
        if err.returncode == 1:
            return True
        raise

    own_pid = os.getpid()
    # Filter rather than list.remove(): the current pid is not guaranteed to be
    # part of the pgrep output.
    other_pids = [int(pid) for pid in pgrep_output.splitlines() if pid.strip() and int(pid) != own_pid]
    return len(other_pids) == 0
def main(argv):
    '''
    Entry point of the GPU metrics collector.

    Exits with status 2 when another collector is already running. Otherwise
    creates/truncates ./gpu_metrics, makes it accessible to every user, and then
    polls 'nvidia-smi -q -x' forever, appending one record per poll.

    argv: command-line arguments (currently unused).
    '''
    if not check_ready_to_run():
        # GPU metrics collector is already running. Exit
        sys.exit(2)

    # Start from an empty metrics file.
    with open("./gpu_metrics", "w"):
        pass
    os.chmod("./gpu_metrics", 0o777)

    cmd = 'nvidia-smi -q -x'
    while True:
        try:
            smi_output = subprocess.check_output(cmd, shell=True)
            parse_nvidia_smi_result(smi_output, '.')
        except Exception:
            # Best effort: report the failure and try again on the next poll.
            # KeyboardInterrupt/SystemExit are deliberately NOT caught so the
            # collector can still be stopped.
            for e in sys.exc_info():
                print("job exporter error {}".format(e))

        # TODO: change to sleep time configurable via arguments
        time.sleep(5)
def parse_nvidia_smi_result(smi, outputDir):
    '''
    Parse the XML produced by nvidia-smi and append one JSON line to
    <outputDir>/gpu_metrics.

    smi: XML document (as returned by 'nvidia-smi -q -x').
    outputDir: directory holding the 'gpu_metrics' file.

    The JSON record holds a timestamp, the number of <gpu> elements and, per
    GPU, its index, GPU utilization, memory utilization (both with the '%'
    sign removed) and the number of <process_info> entries.

    Never raises for ordinary errors: any failure (malformed XML, missing
    element, I/O error) is printed and the record is skipped.
    '''
    try:
        xmldoc = minidom.parseString(smi)
        gpuList = xmldoc.getElementsByTagName('gpu')
        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
            outPut = {}
            outPut["Timestamp"] = time.asctime(time.localtime())
            outPut["gpuCount"] = len(gpuList)
            outPut["gpuInfos"] = []
            for gpuIndex, gpu in enumerate(gpuList):
                utilization = gpu.getElementsByTagName('utilization')[0]
                gpuInfo = {}
                gpuInfo['index'] = gpuIndex
                gpuInfo['gpuUtil'] = utilization.getElementsByTagName('gpu_util')[0].childNodes[0].data.replace("%", "").strip()
                gpuInfo['gpuMemUtil'] = utilization.getElementsByTagName('memory_util')[0].childNodes[0].data.replace("%", "").strip()
                processes = gpu.getElementsByTagName('processes')
                gpuInfo['activeProcessNum'] = len(processes[0].getElementsByTagName('process_info'))

                outPut["gpuInfos"].append(gpuInfo)
            print(outPut)
            outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
            outputFile.flush()
    except Exception as err:
        # Keep the collector loop alive, but say what actually went wrong.
        print('xmldoc parsing error: {}'.format(err))
# Script entry point: run the collector with the command-line arguments
# (the program name itself is not passed on).
if __name__ == "__main__":
    main(sys.argv[1:])
# ============================================================================================================================== #
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ============================================================================================================================== #
import argparse
import errno
import json
import os
import re
# File locations, relative to a trial job's directory.
METRICS_FILENAME = '.nni/metrics'
OFFSET_FILENAME = '.nni/metrics_offset'
JOB_CODE_FILENAME = '.nni/code'
JOB_PID_FILENAME = '.nni/jobpid'
# One line made of two unsigned integers separated by whitespace:
# group 1 is read as the job's return code, group 2 as a timestamp.
# Raw string so that \d and \s reach the regex engine instead of being
# treated as (invalid) string escape sequences.
JOB_CODE_PATTERN = re.compile(r'^(\d+)\s+(\d+)$')
# Width of the length field that follows the magic marker of each metrics record.
LEN_FIELD_SIZE = 6
# Marker expected at the start of each metrics record.
MAGIC = 'ME'
class TrialMetricsReader():
    '''
    Read metrics data from a trial job
    '''
    def __init__(self, trial_job_dir):
        '''
        trial_job_dir: directory of the trial job; every file this reader
        touches lives below it.
        '''
        self.trial_job_dir = trial_job_dir
        self.offset_filename = os.path.join(trial_job_dir, OFFSET_FILENAME)
        self.metrics_filename = os.path.join(trial_job_dir, METRICS_FILENAME)
        self.jobcode_filename = os.path.join(trial_job_dir, JOB_CODE_FILENAME)
        # The attribute name (with its "filemame" spelling) is kept unchanged
        # so that any code reading this attribute keeps working.
        self.jobpid_filemame = os.path.join(trial_job_dir, JOB_PID_FILENAME)

    def _metrics_file_is_empty(self):
        '''Return True when the metrics file is missing or has no content.'''
        if not os.path.isfile(self.metrics_filename):
            return True
        statinfo = os.stat(self.metrics_filename)
        return statinfo.st_size == 0

    def _get_offset(self):
        '''Return the saved read offset, or 0 when no offset file exists.'''
        offset = 0
        if os.path.isfile(self.offset_filename):
            with open(self.offset_filename, 'r') as f:
                offset = int(f.readline())
        return offset

    def _write_offset(self, offset):
        '''
        Persist the read offset.
        Raises ValueError when offset is negative or beyond the metrics file size.
        '''
        statinfo = os.stat(self.metrics_filename)
        if offset < 0 or offset > statinfo.st_size:
            raise ValueError('offset value is invalid: {}'.format(offset))

        with open(self.offset_filename, 'w') as f:
            f.write(str(offset)+'\n')

    def _read_all_available_records(self, offset):
        '''
        Read every record stored after `offset`, save the new offset and return
        the record payloads as a list of strings.

        Each record is: a marker of len(MAGIC) characters, a decimal length
        field of LEN_FIELD_SIZE characters, then the payload.

        Raises ValueError when a record is truncated or malformed; in that case
        the saved offset is left untouched.
        '''
        new_offset = offset
        metrics = []
        # Binary mode: the offset is validated against the file size in bytes
        # (see _write_offset) and used with seek(), so positions and lengths
        # have to be handled as bytes. In text mode read(n) counts characters,
        # which mis-reads any payload containing non-ASCII text.
        # NOTE(review): assumes the writer stores the UTF-8 byte length of the
        # payload in the length field - confirm against the writer.
        with open(self.metrics_filename, 'rb') as f:
            f.seek(offset)
            while True:
                magic_string = f.read(len(MAGIC))
                # empty data means EOF
                if not magic_string:
                    break
                strdatalen = f.read(LEN_FIELD_SIZE)
                # a missing or short length field means the record is incomplete
                if len(strdatalen) != LEN_FIELD_SIZE:
                    raise ValueError("metric file {} format error after offset: {}.".format(self.metrics_filename, new_offset))
                datalen = int(strdatalen)
                data = f.read(datalen)

                if datalen > 0 and len(data) == datalen:
                    new_offset = f.tell()
                    metrics.append(data.decode('utf8'))
                else:
                    raise ValueError("metric file {} format error after offset: {}.".format(self.metrics_filename, new_offset))
        self._write_offset(new_offset)
        return metrics

    def _pid_exists(self, pid):
        '''
        Return True when a process with the given pid exists.
        Raises ValueError for pid 0.
        '''
        if pid < 0:
            return False
        if pid == 0:
            # According to "man 2 kill" PID 0 refers to every process
            # in the process group of the calling process.
            # On certain systems 0 is a valid PID but we have no way
            # to know that in a portable fashion.
            raise ValueError('invalid PID 0')
        try:
            # Signal 0 performs the existence/permission check without
            # delivering a signal.
            os.kill(pid, 0)
        except OSError as err:
            if err.errno == errno.ESRCH:
                # ESRCH == No such process
                return False
            elif err.errno == errno.EPERM:
                # EPERM clearly means there's a process to deny access to
                return True
            else:
                # According to "man 2 kill" possible error values are
                # (EINVAL, EPERM, ESRCH)
                raise
        else:
            return True

    def read_trial_metrics(self):
        '''
        Read available metrics data for a trial
        '''
        if self._metrics_file_is_empty():
            return []

        offset = self._get_offset()
        return self._read_all_available_records(offset)

    def read_trial_status(self):
        '''
        Return a (status, end timestamp) tuple for the trial.

        ('UNKNOWN', -1) when no pid file exists, ('RUNNING', -1) while the
        recorded process exists, otherwise whatever the job return code file
        says (see _read_job_return_code).
        '''
        if os.path.isfile(self.jobpid_filemame):
            with open(self.jobpid_filemame, 'r') as f:
                jobpid = int(f.readline())
                if self._pid_exists(jobpid):
                    return 'RUNNING' ,-1
                else:
                    return self._read_job_return_code()
        else:
            # raise ValueError('offset value is invalid: {}'.format(offset))
            return 'UNKNOWN' ,-1

    def _read_job_return_code(self):
        '''
        Translate the job return code file into a (status, timestamp) tuple.

        Return code 0 maps to 'SUCCEEDED', 141 to 'USER_CANCELED', anything
        else to 'FAILED'.
        Raises ValueError when the file is missing or its first line does not
        match JOB_CODE_PATTERN.
        '''
        if os.path.isfile(self.jobcode_filename):
            with open(self.jobcode_filename, 'r') as f:
                job_return_code = f.readline()
                match = JOB_CODE_PATTERN.match(job_return_code)
                if match:
                    return_code = int(match.group(1))
                    timestamp = int(match.group(2))
                    if return_code == 0:
                        status = 'SUCCEEDED'
                    elif return_code == 141:
                        status = 'USER_CANCELED'
                    else:
                        status = 'FAILED'
                    return status, timestamp
                else:
                    raise ValueError('Job code file format incorrect')
        else:
            raise ValueError('job return code file doesnt exist: {}'.format(self.jobcode_filename))
def read_experiment_metrics(args):
    '''
    Gather metrics and status of the requested trial jobs and print them to
    stdout as a single JSON array.

    args.experiment_dir: root directory of the experiment; each trial job is
        looked up under <experiment_dir>/trials/<trial job id>.
    args.trial_job_ids: comma separated trial job ids; surrounding whitespace
        of every id is ignored.

    A trial job whose data cannot be read is left out of the output.
    '''
    results = []
    stripped_ids = (raw_id.strip() for raw_id in args.trial_job_ids.strip().split(','))
    for trial_job_id in stripped_ids:
        try:
            reader = TrialMetricsReader(os.path.join(args.experiment_dir, 'trials', trial_job_id))
            metrics = reader.read_trial_metrics()
            job_status, end_timestamp = reader.read_trial_status()
        except Exception:
            #TODO error logging to file
            continue
        results.append({
            'jobId': trial_job_id,
            'metrics': metrics,
            'jobStatus': job_status,
            'endTimestamp': end_timestamp,
        })
    print(json.dumps(results))
# Command-line entry point. Both options are required; arguments the parser
# does not know are collected in UNKNOWN and otherwise ignored.
if __name__ == '__main__':
    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--experiment_dir", type=str, help="Root directory of experiment", required=True)
    PARSER.add_argument("--trial_job_ids", type=str, help="Trial job ids splited with ','", required=True)
    ARGS, UNKNOWN = PARSER.parse_known_args()
    read_experiment_metrics(ARGS)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment