Unverified Commit 5c634149 authored by chicm-ms's avatar chicm-ms Committed by GitHub
Browse files

Fix bug: some trial jobs hyper params not stored and error handling updates (#225)

* Pull latest code (#2)

* webui logpath and document (#135)

* Add webui document and logpath as a href

* fix tslint

* fix comments by Chengmin

* Pai training service bug fix and enhancement (#136)

* Add NNI installation scripts

* Update pai script, update NNI_out_dir

* Update NNI dir in nni sdk local.py

* Create .nni folder in nni sdk local.py

* Add check before creating .nni folder

* Fix typo for PAI_INSTALL_NNI_SHELL_FORMAT

* Improve annotation (#138)

* Improve annotation

* Minor bugfix

* Selectively install through pip (#139)

Selectively install through pip 
* update setup.py

* fix paiTrainingService bugs (#137)

* fix nnictl bug

* add hdfs host validation

* fix bugs

* fix dockerfile

* fix install.sh

* update install.sh

* fix dockerfile

* Set timeout for HDFSUtility exists function

* remove unused TODO

* fix sdk

* add optional for outputDir and dataDir

* refactor dockerfile.base

* Remove unused import in hdfsclientUtility

* Add documentation for NNI PAI mode experiment (#141)

* Add documentation for NNI PAI mode

* Fix typo based on PR comments

* Exit with subprocess return code of trial keeper

* Remove additional exit code

* Fix typo based on PR comments

* update doc for smac tuner (#140)

* Revert "Selectively install through pip (#139)" due to potential pip install issue (#142)

* Revert "Selectively install through pip (#139)"

This reverts commit 1d174836.

* Add exit code of subprocess for trial_keeper

* Update README, add link to PAImode doc

* fix bug (#147)

* Refactor nnictl and add config_pai.yml (#144)

* fix nnictl bug

* add hdfs host validation

* fix bugs

* fix dockerfile

* fix install.sh

* update install.sh

* fix dockerfile

* Set timeout for HDFSUtility exists function

* remove unused TODO

* fix sdk

* add optional for outputDir and dataDir

* refactor dockerfile.base

* Remove unused import in hdfsclientUtility

* add config_pai.yml

* refactor nnictl create logic and add colorful print

* fix nnictl stop logic

* add annotation for config_pai.yml

* add document for start experiment

* fix config.yml

* fix document

* Fix trial keeper wrongly exit issue (#152)

* Fix trial keeper bug, use actual exitcode to exit rather than 1

* Fix bug of table sort (#145)

* Update doc for PAIMode and v0.2 release notes (#153)

* Update v0.2 documentation regards to release note and PAI training service

* Update document to describe NNI docker image

* Bug fix for SQuAD example tuner. (#134)

* Update Makefile (#151)

* test

* update setup.py

* update Makefile and install.sh

* rever setup.py

* change color

* update doc

* update doc

* fix auto-completion's extra space

* update Makefile

* update webui

* Update doc image (#163)

* update doc

* trivial

* trivial

* trivial

* trivial

* trivial

* trivial

* update image

* update image size

* Update ga squad (#104)

* update readme in ga_squad

* update readme

* fix typo

* Update README.md

* Update README.md

* Update README.md

* update readme

* sklearn examples (#169)

* fix nnictl bug

* fix install.sh

* add sklearn-regression example

* add sklearn classification

* update sklearn

* update example

* remove additional code

* Update batch tuner (#158)

* update readme in ga_squad

* update readme

* fix typo

* Update README.md

* Update README.md

* Update README.md

* update readme

* update batch tuner

* Quickly fix cascading search space bug in tuner (#156)

* update readme in ga_squad

* update readme

* fix typo

* Update README.md

* Update README.md

* Update README.md

* update readme

* quickly fix cascading searchspace bug in tuner

* Add iterative search space example (#119)

* update readme in ga_squad

* update readme

* fix typo

* Update README.md

* Update README.md

* Update README.md

* update readme

* add iterative search space example

* update

* update readme

* change name

* Fix bug: some trial jobs hyper params not stored when fast finished

* updates

* updates

* updates

* updates
parent ad88e3a3
......@@ -26,9 +26,12 @@ export namespace NNIErrorNames {
}
export class NNIError extends Error {
constructor (name: string, message: string) {
constructor (name: string, message: string, err?: Error) {
super(message);
this.name = name;
if (err !== undefined) {
this.stack = err.stack;
}
}
}
......
......@@ -23,6 +23,7 @@ import * as assert from 'assert';
import { ChildProcess } from 'child_process';
import { EventEmitter } from 'events';
import { Readable, Writable } from 'stream';
import { NNIError } from '../common/errors';
import { getLogger, Logger } from '../common/log';
import * as CommandType from './commands';
......@@ -98,10 +99,15 @@ class IpcInterface {
public sendCommand(commandType: string, content: string = ''): void {
this.logger.debug(`ipcInterface command type: [${commandType}], content:[${content}]`);
assert.ok(this.acceptCommandTypes.has(commandType));
try {
const data: Buffer = encodeCommand(commandType, content);
if (!this.outgoingStream.write(data)) {
this.logger.error('Commands jammed in buffer!');
}
} catch (err) {
throw new NNIError('Dispatcher Error', `Dispatcher Error: ${err.message}`, err);
}
}
/**
......
......@@ -25,8 +25,8 @@ import { Deferred } from 'ts-deferred';
import * as component from '../common/component';
import { Database, DataStore, MetricData, MetricDataRecord, MetricType,
TrialJobEvent, TrialJobEventRecord, TrialJobInfo } from '../common/datastore';
import { isNewExperiment } from '../common/experimentStartupInfo';
import { getExperimentId } from '../common/experimentStartupInfo';
import { NNIError } from '../common/errors';
import { getExperimentId, isNewExperiment } from '../common/experimentStartupInfo';
import { getLogger, Logger } from '../common/log';
import { ExperimentProfile, TrialJobStatistics } from '../common/manager';
import { TrialJobStatus } from '../common/trainingService';
......@@ -72,7 +72,11 @@ class NNIDataStore implements DataStore {
}
public async storeExperimentProfile(experimentProfile: ExperimentProfile): Promise<void> {
try {
await this.db.storeExperimentProfile(experimentProfile);
} catch (err) {
throw new NNIError('Datastore error', `Datastore error: ${err.message}`, err);
}
}
public getExperimentProfile(experimentId: string): Promise<ExperimentProfile> {
......@@ -82,7 +86,11 @@ class NNIDataStore implements DataStore {
public storeTrialJobEvent(event: TrialJobEvent, trialJobId: string, data?: string, logPath?: string): Promise<void> {
this.log.debug(`storeTrialJobEvent: event: ${event}, data: ${data}, logpath: ${logPath}`);
return this.db.storeTrialJobEvent(event, trialJobId, data, logPath);
return this.db.storeTrialJobEvent(event, trialJobId, data, logPath).catch(
(err: Error) => {
throw new NNIError('Datastore error', `Datastore error: ${err.message}`, err);
}
);
}
public async getTrialJobStatistics(): Promise<any[]> {
......@@ -128,6 +136,7 @@ class NNIDataStore implements DataStore {
return;
}
assert(trialJobId === metrics.trial_job_id);
try {
await this.db.storeMetricData(trialJobId, JSON.stringify({
trialJobId: metrics.trial_job_id,
parameterId: metrics.parameter_id,
......@@ -136,6 +145,9 @@ class NNIDataStore implements DataStore {
data: metrics.value,
timestamp: Date.now()
}));
} catch (err) {
throw new NNIError('Datastore error', `Datastore error: ${err.message}`, err);
}
}
public getMetricData(trialJobId?: string, metricType?: MetricType): Promise<MetricDataRecord[]> {
......
......@@ -25,6 +25,7 @@ import { ChildProcess, spawn } from 'child_process';
import { Deferred } from 'ts-deferred';
import * as component from '../common/component';
import { DataStore, MetricDataRecord, MetricType, TrialJobInfo } from '../common/datastore';
import { NNIError } from '../common/errors';
import { getExperimentId } from '../common/experimentStartupInfo';
import { getLogger, Logger } from '../common/log';
import {
......@@ -122,7 +123,7 @@ class NNIManager implements Manager {
}
const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.multiPhase);
console.log(`dispatcher command: ${dispatcherCommand}`);
this.log.debug(`dispatcher command: ${dispatcherCommand}`);
this.setupTuner(
//expParams.tuner.tunerCommand,
dispatcherCommand,
......@@ -136,6 +137,7 @@ class NNIManager implements Manager {
this.run().catch((err: Error) => {
this.criticalError(err);
});
return this.experimentProfile.id;
}
......@@ -146,12 +148,12 @@ class NNIManager implements Manager {
const expParams: ExperimentParams = this.experimentProfile.params;
// Set up multiphase config
if(expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) {
if (expParams.multiPhase && this.trainingService.isMultiPhaseJobSupported) {
this.trainingService.setClusterMetadata('multiPhase', expParams.multiPhase.toString());
}
const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.multiPhase);
console.log(`dispatcher command: ${dispatcherCommand}`);
this.log.debug(`dispatcher command: ${dispatcherCommand}`);
this.setupTuner(
dispatcherCommand,
undefined,
......@@ -369,8 +371,12 @@ class NNIManager implements Manager {
await Promise.all([
this.periodicallyUpdateExecDuration(),
this.trainingService.run(),
this.trialJobsMaintainer.run()]);
this.trainingService.run().catch((err: Error) => {
throw new NNIError('Training service error', `Training service error: ${err.message}`, err);
}),
this.trialJobsMaintainer.run().catch((err: Error) => {
throw new NNIError('Job maintainer error', `Job maintainer error: ${err.message}`, err);
})]);
}
private addEventListeners(): void {
......@@ -380,26 +386,26 @@ class NNIManager implements Manager {
}
this.trainingService.addTrialJobMetricListener((metric: TrialJobMetric) => {
this.onTrialJobMetrics(metric).catch((err: Error) => {
this.criticalError(err);
this.criticalError(new NNIError('Job metrics error', `Job metrics error: ${err.message}`, err));
});
});
this.trialJobsMaintainer.on(async (event: TrialJobMaintainerEvent, trialJobDetail: TrialJobDetail) => {
this.onTrialJobEvent(event, trialJobDetail).catch((err: Error) => {
this.criticalError(err);
this.criticalError(new NNIError('Trial job event error', `Trial job event error: ${err.message}`, err));
});
});
this.dispatcher.onCommand((commandType: string, content: string) => {
this.onTunerCommand(commandType, content).catch((err: Error) => {
this.criticalError(err);
this.criticalError(new NNIError('Tuner command event error', `Tuner command event error: ${err.message}`, err));
});
});
}
private sendInitTunerCommands(): void {
if (this.dispatcher === undefined) {
throw new Error('Error: tuner has not been setup');
throw new Error('Dispatcher error: tuner has not been setup');
}
// TO DO: we should send INITIALIZE command to tuner if user's tuner needs to run init method in tuner
this.log.debug(`Send tuner command: update search space: ${this.experimentProfile.params.searchSpace}`);
......@@ -479,9 +485,13 @@ class NNIManager implements Manager {
};
const trialJobDetail: TrialJobDetail = await this.trainingService.submitTrialJob(trialJobAppForm);
this.trialJobsMaintainer.setTrialJob(trialJobDetail.id, Object.assign({}, trialJobDetail));
// TO DO: to uncomment
assert(trialJobDetail.status === 'WAITING');
await this.dataStore.storeTrialJobEvent(trialJobDetail.status, trialJobDetail.id, content, trialJobDetail.url);
const jobDetailSnapshot: TrialJobDetail | undefined = this.trialJobsMaintainer.getTrialJob(trialJobDetail.id);
if (jobDetailSnapshot !== undefined) {
await this.dataStore.storeTrialJobEvent(
jobDetailSnapshot.status, jobDetailSnapshot.id, content, jobDetailSnapshot.url);
} else {
assert(false, `undefined jobdetail in job maintainer: ${trialJobDetail.id}`);
}
if (this.currSubmittedTrialNum === this.experimentProfile.params.maxTrialNum) {
this.trialJobsMaintainer.setNoMoreTrials();
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment