Unverified Commit 99f7d79c authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Support experiment view (#1524)

parent 0b7d6260
...@@ -10,6 +10,7 @@ nnictl support commands: ...@@ -10,6 +10,7 @@ nnictl support commands:
* [nnictl create](#create) * [nnictl create](#create)
* [nnictl resume](#resume) * [nnictl resume](#resume)
* [nnictl view](#view)
* [nnictl stop](#stop) * [nnictl stop](#stop)
* [nnictl update](#update) * [nnictl update](#update)
* [nnictl trial](#trial) * [nnictl trial](#trial)
...@@ -104,6 +105,35 @@ Debug mode will disable version check function in Trialkeeper. ...@@ -104,6 +105,35 @@ Debug mode will disable version check function in Trialkeeper.
nnictl resume [experiment_id] --port 8088 nnictl resume [experiment_id] --port 8088
``` ```
<a name="view"></a>
![](https://placehold.it/15/1589F0/000000?text=+) `nnictl view`
* Description
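You can use this command to view a stopped experiment in read-only mode. The experiment is not resumed, and it cannot be modified while it is being viewed.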
* Usage
```bash
nnictl view [OPTIONS]
```
* Options
|Name, shorthand|Required|Default|Description|
|------|------|------ |------|
|id| True| |The id of the experiment you want to view|
|--port, -p| False| |REST server port to use for the experiment you want to view|
* Example
> View an experiment on the specified port 8088
```bash
nnictl view [experiment_id] --port 8088
```
<a name="stop"></a> <a name="stop"></a>
![](https://placehold.it/15/1589F0/000000?text=+) `nnictl stop` ![](https://placehold.it/15/1589F0/000000?text=+) `nnictl stop`
......
...@@ -33,11 +33,11 @@ class ExperimentStartupInfo { ...@@ -33,11 +33,11 @@ class ExperimentStartupInfo {
private initTrialSequenceID: number = 0; private initTrialSequenceID: number = 0;
private logDir: string = ''; private logDir: string = '';
private logLevel: string = ''; private logLevel: string = '';
private readonly: boolean = false;
public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string): void { public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string, readonly?: boolean): void {
assert(!this.initialized); assert(!this.initialized);
assert(experimentId.trim().length > 0); assert(experimentId.trim().length > 0);
this.newExperiment = newExperiment; this.newExperiment = newExperiment;
this.experimentId = experimentId; this.experimentId = experimentId;
this.basePort = basePort; this.basePort = basePort;
...@@ -52,6 +52,10 @@ class ExperimentStartupInfo { ...@@ -52,6 +52,10 @@ class ExperimentStartupInfo {
if (logLevel !== undefined && logLevel.length > 1) { if (logLevel !== undefined && logLevel.length > 1) {
this.logLevel = logLevel; this.logLevel = logLevel;
} }
if (readonly !== undefined) {
this.readonly = readonly;
}
} }
public getExperimentId(): string { public getExperimentId(): string {
...@@ -84,6 +88,12 @@ class ExperimentStartupInfo { ...@@ -84,6 +88,12 @@ class ExperimentStartupInfo {
return this.logLevel; return this.logLevel;
} }
/**
 * Tell whether the experiment was started in readonly mode.
 *
 * Asserts that the startup info has already been initialized, then returns
 * the `readonly` field, which is `false` unless `setStartupInfo` was given
 * an explicit value.
 *
 * @returns the stored readonly flag
 */
public isReadonly(): boolean {
assert(this.initialized);
return this.readonly;
}
public setInitTrialSequenceId(initSequenceId: number): void { public setInitTrialSequenceId(initSequenceId: number): void {
assert(this.initialized); assert(this.initialized);
this.initTrialSequenceID = initSequenceId; this.initTrialSequenceID = initSequenceId;
...@@ -121,10 +131,14 @@ function getExperimentStartupInfo(): ExperimentStartupInfo { ...@@ -121,10 +131,14 @@ function getExperimentStartupInfo(): ExperimentStartupInfo {
} }
function setExperimentStartupInfo( function setExperimentStartupInfo(
newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string): void { newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string, readonly?: boolean): void {
component.get<ExperimentStartupInfo>(ExperimentStartupInfo) component.get<ExperimentStartupInfo>(ExperimentStartupInfo)
.setStartupInfo(newExperiment, experimentId, basePort, logDir, logLevel); .setStartupInfo(newExperiment, experimentId, basePort, logDir, logLevel, readonly);
}
/**
 * Module-level shortcut: fetch the ExperimentStartupInfo instance through
 * `component.get` and return the result of its `isReadonly()` method.
 *
 * @returns the readonly flag reported by the ExperimentStartupInfo instance
 */
function isReadonly(): boolean {
return component.get<ExperimentStartupInfo>(ExperimentStartupInfo).isReadonly();
} }
export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment, getExperimentStartupInfo, export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment, getExperimentStartupInfo,
setExperimentStartupInfo, setInitTrialSequenceId, getInitTrialSequenceId }; setExperimentStartupInfo, setInitTrialSequenceId, getInitTrialSequenceId, isReadonly };
...@@ -26,7 +26,7 @@ import { Writable } from 'stream'; ...@@ -26,7 +26,7 @@ import { Writable } from 'stream';
import { WritableStreamBuffer } from 'stream-buffers'; import { WritableStreamBuffer } from 'stream-buffers';
import { format } from 'util'; import { format } from 'util';
import * as component from '../common/component'; import * as component from '../common/component';
import { getExperimentStartupInfo } from './experimentStartupInfo'; import { getExperimentStartupInfo, isReadonly } from './experimentStartupInfo';
import { getLogDir } from './utils'; import { getLogDir } from './utils';
const FATAL: number = 1; const FATAL: number = 1;
...@@ -76,6 +76,7 @@ class Logger { ...@@ -76,6 +76,7 @@ class Logger {
private level: number = INFO; private level: number = INFO;
private bufferSerialEmitter: BufferSerialEmitter; private bufferSerialEmitter: BufferSerialEmitter;
private writable: Writable; private writable: Writable;
private readonly: boolean = false;
constructor(fileName?: string) { constructor(fileName?: string) {
let logFile: string | undefined = fileName; let logFile: string | undefined = fileName;
...@@ -95,6 +96,8 @@ class Logger { ...@@ -95,6 +96,8 @@ class Logger {
if (logLevel !== undefined) { if (logLevel !== undefined) {
this.level = logLevel; this.level = logLevel;
} }
this.readonly = isReadonly();
} }
public close() { public close() {
...@@ -134,14 +137,21 @@ class Logger { ...@@ -134,14 +137,21 @@ class Logger {
public fatal(...param: any[]): void { public fatal(...param: any[]): void {
this.log('FATAL', param); this.log('FATAL', param);
} }
/**
* if the experiment is not in readonly mode, write log content to stream
* @param level log level
* @param param the params to be written
*/
private log(level: string, param: any[]): void { private log(level: string, param: any[]): void {
const buffer: WritableStreamBuffer = new WritableStreamBuffer(); if (!this.readonly) {
buffer.write(`[${(new Date()).toLocaleString()}] ${level} `); const buffer: WritableStreamBuffer = new WritableStreamBuffer();
buffer.write(format(param)); buffer.write(`[${(new Date()).toLocaleString()}] ${level} `);
buffer.write('\n'); buffer.write(format(param));
buffer.end(); buffer.write('\n');
this.bufferSerialEmitter.feed(buffer.getContents()); buffer.end();
this.bufferSerialEmitter.feed(buffer.getContents());
}
} }
} }
......
...@@ -24,6 +24,10 @@ import { TrialJobStatus } from './trainingService'; ...@@ -24,6 +24,10 @@ import { TrialJobStatus } from './trainingService';
type ProfileUpdateType = 'TRIAL_CONCURRENCY' | 'MAX_EXEC_DURATION' | 'SEARCH_SPACE' | 'MAX_TRIAL_NUM'; type ProfileUpdateType = 'TRIAL_CONCURRENCY' | 'MAX_EXEC_DURATION' | 'SEARCH_SPACE' | 'MAX_TRIAL_NUM';
type ExperimentStatus = 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL'; type ExperimentStatus = 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL';
/**
 * String constants for the experiment start-up modes.
 *
 * These are the values the entry script compares against the `--start_mode`
 * argument. There is no dedicated "view" mode here: the nnictl launcher maps
 * `view` to `resume` plus `--readonly true`.
 */
namespace ExperimentStartUpMode {
// Create a brand new experiment.
export const NEW = 'new';
// Re-open an existing experiment identified by its experiment id.
export const RESUME = 'resume';
}
interface ExperimentParams { interface ExperimentParams {
authorName: string; authorName: string;
...@@ -95,7 +99,7 @@ interface NNIManagerStatus { ...@@ -95,7 +99,7 @@ interface NNIManagerStatus {
abstract class Manager { abstract class Manager {
public abstract startExperiment(experimentParams: ExperimentParams): Promise<string>; public abstract startExperiment(experimentParams: ExperimentParams): Promise<string>;
public abstract resumeExperiment(): Promise<void>; public abstract resumeExperiment(readonly: boolean): Promise<void>;
public abstract stopExperiment(): Promise<void>; public abstract stopExperiment(): Promise<void>;
public abstract getExperimentProfile(): Promise<ExperimentProfile>; public abstract getExperimentProfile(): Promise<ExperimentProfile>;
public abstract updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void>; public abstract updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void>;
...@@ -115,4 +119,4 @@ abstract class Manager { ...@@ -115,4 +119,4 @@ abstract class Manager {
public abstract getStatus(): NNIManagerStatus; public abstract getStatus(): NNIManagerStatus;
} }
export { Manager, ExperimentParams, ExperimentProfile, TrialJobStatistics, ProfileUpdateType, NNIManagerStatus, ExperimentStatus }; export { Manager, ExperimentParams, ExperimentProfile, TrialJobStatistics, ProfileUpdateType, NNIManagerStatus, ExperimentStatus, ExperimentStartUpMode };
...@@ -59,6 +59,7 @@ class NNIManager implements Manager { ...@@ -59,6 +59,7 @@ class NNIManager implements Manager {
private waitingTrials: string[]; private waitingTrials: string[];
private trialJobs: Map<string, TrialJobDetail>; private trialJobs: Map<string, TrialJobDetail>;
private trialDataForTuner: string; private trialDataForTuner: string;
private readonly: boolean;
private trialJobMetricListener: (metric: TrialJobMetric) => void; private trialJobMetricListener: (metric: TrialJobMetric) => void;
...@@ -72,6 +73,7 @@ class NNIManager implements Manager { ...@@ -72,6 +73,7 @@ class NNIManager implements Manager {
this.waitingTrials = []; this.waitingTrials = [];
this.trialJobs = new Map<string, TrialJobDetail>(); this.trialJobs = new Map<string, TrialJobDetail>();
this.trialDataForTuner = ''; this.trialDataForTuner = '';
this.readonly = false;
this.log = getLogger(); this.log = getLogger();
this.dataStore = component.get(DataStore); this.dataStore = component.get(DataStore);
...@@ -88,6 +90,9 @@ class NNIManager implements Manager { ...@@ -88,6 +90,9 @@ class NNIManager implements Manager {
} }
public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> { public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> {
if (this.readonly) {
return Promise.reject(new Error('Error: can not update experiment profile in readonly mode!'));
}
switch (updateType) { switch (updateType) {
case 'TRIAL_CONCURRENCY': case 'TRIAL_CONCURRENCY':
this.updateTrialConcurrency(experimentProfile.params.trialConcurrency); this.updateTrialConcurrency(experimentProfile.params.trialConcurrency);
...@@ -109,6 +114,9 @@ class NNIManager implements Manager { ...@@ -109,6 +114,9 @@ class NNIManager implements Manager {
} }
public importData(data: string): Promise<void> { public importData(data: string): Promise<void> {
if (this.readonly) {
return Promise.reject(new Error('Error: can not import data in readonly mode!'));
}
if (this.dispatcher === undefined) { if (this.dispatcher === undefined) {
return Promise.reject( return Promise.reject(
new Error('tuner has not been setup') new Error('tuner has not been setup')
...@@ -124,6 +132,9 @@ class NNIManager implements Manager { ...@@ -124,6 +132,9 @@ class NNIManager implements Manager {
} }
public addCustomizedTrialJob(hyperParams: string): Promise<void> { public addCustomizedTrialJob(hyperParams: string): Promise<void> {
if (this.readonly) {
return Promise.reject(new Error('Error: can not add customized trial job in readonly mode!'));
}
if (this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) { if (this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
return Promise.reject( return Promise.reject(
new Error('reach maxTrialNum') new Error('reach maxTrialNum')
...@@ -136,6 +147,9 @@ class NNIManager implements Manager { ...@@ -136,6 +147,9 @@ class NNIManager implements Manager {
} }
public async cancelTrialJobByUser(trialJobId: string): Promise<void> { public async cancelTrialJobByUser(trialJobId: string): Promise<void> {
if (this.readonly) {
return Promise.reject(new Error('Error: can not cancel trial job in readonly mode!'));
}
this.log.info(`User cancelTrialJob: ${trialJobId}`); this.log.info(`User cancelTrialJob: ${trialJobId}`);
await this.trainingService.cancelTrialJob(trialJobId); await this.trainingService.cancelTrialJob(trialJobId);
await this.dataStore.storeTrialJobEvent('USER_TO_CANCEL', trialJobId, ''); await this.dataStore.storeTrialJobEvent('USER_TO_CANCEL', trialJobId, '');
...@@ -180,13 +194,16 @@ class NNIManager implements Manager { ...@@ -180,13 +194,16 @@ class NNIManager implements Manager {
return this.experimentProfile.id; return this.experimentProfile.id;
} }
public async resumeExperiment(): Promise<void> { public async resumeExperiment(readonly: boolean): Promise<void> {
this.log.info(`Resuming experiment: ${this.experimentProfile.id}`); this.log.info(`Resuming experiment: ${this.experimentProfile.id}`);
//Fetch back the experiment profile //Fetch back the experiment profile
const experimentId: string = getExperimentId(); const experimentId: string = getExperimentId();
this.experimentProfile = await this.dataStore.getExperimentProfile(experimentId); this.experimentProfile = await this.dataStore.getExperimentProfile(experimentId);
this.readonly = readonly;
if (readonly) {
return Promise.resolve();
}
const expParams: ExperimentParams = this.experimentProfile.params; const expParams: ExperimentParams = this.experimentProfile.params;
setInitTrialSequenceId(this.experimentProfile.maxSequenceId + 1); setInitTrialSequenceId(this.experimentProfile.maxSequenceId + 1);
// Set up multiphase config // Set up multiphase config
...@@ -196,7 +213,7 @@ class NNIManager implements Manager { ...@@ -196,7 +213,7 @@ class NNIManager implements Manager {
// Set up versionCheck config // Set up versionCheck config
if (expParams.versionCheck !== undefined) { if (expParams.versionCheck !== undefined) {
this.trainingService.setClusterMetadata('versionCheck', expParams.versionCheck.toString()); this.trainingService.setClusterMetadata('version_check', expParams.versionCheck.toString());
} }
const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.advisor, const dispatcherCommand: string = getMsgDispatcherCommand(expParams.tuner, expParams.assessor, expParams.advisor,
...@@ -247,6 +264,9 @@ class NNIManager implements Manager { ...@@ -247,6 +264,9 @@ class NNIManager implements Manager {
} }
public async setClusterMetadata(key: string, value: string): Promise<void> { public async setClusterMetadata(key: string, value: string): Promise<void> {
if (this.readonly) {
return Promise.reject(new Error('Error: can not set cluster metadata in readonly mode!'));
}
this.log.info(`NNIManager setClusterMetadata, key: ${key}, value: ${value}`); this.log.info(`NNIManager setClusterMetadata, key: ${key}, value: ${value}`);
let timeoutId: NodeJS.Timer; let timeoutId: NodeJS.Timer;
// TO DO: move timeout value to constants file // TO DO: move timeout value to constants file
......
...@@ -26,7 +26,7 @@ import * as component from './common/component'; ...@@ -26,7 +26,7 @@ import * as component from './common/component';
import { Database, DataStore } from './common/datastore'; import { Database, DataStore } from './common/datastore';
import { setExperimentStartupInfo } from './common/experimentStartupInfo'; import { setExperimentStartupInfo } from './common/experimentStartupInfo';
import { getLogger, Logger, logLevelNameMap } from './common/log'; import { getLogger, Logger, logLevelNameMap } from './common/log';
import { Manager } from './common/manager'; import { Manager, ExperimentStartUpMode } from './common/manager';
import { TrainingService } from './common/trainingService'; import { TrainingService } from './common/trainingService';
import { getLogDir, mkDirP, parseArg, uniqueString } from './common/utils'; import { getLogDir, mkDirP, parseArg, uniqueString } from './common/utils';
import { NNIDataStore } from './core/nniDataStore'; import { NNIDataStore } from './core/nniDataStore';
...@@ -43,10 +43,10 @@ import { ...@@ -43,10 +43,10 @@ import {
function initStartupInfo( function initStartupInfo(
startExpMode: string, resumeExperimentId: string, basePort: number, startExpMode: string, resumeExperimentId: string, basePort: number,
logDirectory: string, experimentLogLevel: string): void { logDirectory: string, experimentLogLevel: string, readonly: boolean): void {
const createNew: boolean = (startExpMode === 'new'); const createNew: boolean = (startExpMode === ExperimentStartUpMode.NEW);
const expId: string = createNew ? uniqueString(8) : resumeExperimentId; const expId: string = createNew ? uniqueString(8) : resumeExperimentId;
setExperimentStartupInfo(createNew, expId, basePort, logDirectory, experimentLogLevel); setExperimentStartupInfo(createNew, expId, basePort, logDirectory, experimentLogLevel, readonly);
} }
async function initContainer(platformMode: string): Promise<void> { async function initContainer(platformMode: string): Promise<void> {
...@@ -108,15 +108,15 @@ if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'].includes(mode ...@@ -108,15 +108,15 @@ if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'].includes(mode
} }
const startMode: string = parseArg(['--start_mode', '-s']); const startMode: string = parseArg(['--start_mode', '-s']);
if (!['new', 'resume'].includes(startMode)) { if (![ExperimentStartUpMode.NEW, ExperimentStartUpMode.RESUME].includes(startMode)) {
console.log(`FATAL: unknown start_mode: ${startMode}`); console.log(`FATAL: unknown start_mode: ${startMode}`);
usage(); usage();
process.exit(1); process.exit(1);
} }
const experimentId: string = parseArg(['--experiment_id', '-id']); const experimentId: string = parseArg(['--experiment_id', '-id']);
if (startMode === 'resume' && experimentId.trim().length < 1) { if ((startMode === ExperimentStartUpMode.RESUME) && experimentId.trim().length < 1) {
console.log(`FATAL: cannot resume experiment, invalid experiment_id: ${experimentId}`); console.log(`FATAL: cannot resume the experiment, invalid experiment_id: ${experimentId}`);
usage(); usage();
process.exit(1); process.exit(1);
} }
...@@ -133,7 +133,15 @@ if (logLevel.length > 0 && !logLevelNameMap.has(logLevel)) { ...@@ -133,7 +133,15 @@ if (logLevel.length > 0 && !logLevelNameMap.has(logLevel)) {
console.log(`FATAL: invalid log_level: ${logLevel}`); console.log(`FATAL: invalid log_level: ${logLevel}`);
} }
initStartupInfo(startMode, experimentId, port, logDir, logLevel); const readonlyArg: string = parseArg(['--readonly', '-r']);
// Validate the --readonly flag and convert it to a boolean.
//
// The previous check, `('true' || 'false').includes(arg)`, collapsed to
// `'true'.includes(arg)`: a substring test against the single string 'true'.
// It rejected an explicit `--readonly false` and accepted fragments such as
// 'tr' or 'rue'. The value is now matched against an explicit list.
//
// The empty string stays accepted and means "not readonly", as it did before
// ('true'.includes('') is true). The nnictl launcher only passes --readonly in
// view mode, so the flag is normally absent.
// NOTE(review): presumably parseArg yields '' when the flag is missing - confirm.
const normalizedReadonlyArg: string = readonlyArg.toLowerCase();
if (!['', 'true', 'false'].includes(normalizedReadonlyArg)) {
    console.log(`FATAL: readonly property should only be true or false`);
    usage();
    process.exit(1);
}
const readonly: boolean = normalizedReadonlyArg === 'true';
initStartupInfo(startMode, experimentId, port, logDir, logLevel, readonly);
mkDirP(getLogDir()) mkDirP(getLogDir())
.then(async () => { .then(async () => {
......
...@@ -25,9 +25,9 @@ import * as path from 'path'; ...@@ -25,9 +25,9 @@ import * as path from 'path';
import * as component from '../common/component'; import * as component from '../common/component';
import { DataStore, MetricDataRecord, TrialJobInfo } from '../common/datastore'; import { DataStore, MetricDataRecord, TrialJobInfo } from '../common/datastore';
import { NNIError, NNIErrorNames } from '../common/errors'; import { NNIError, NNIErrorNames } from '../common/errors';
import { isNewExperiment } from '../common/experimentStartupInfo'; import { isNewExperiment, isReadonly } from '../common/experimentStartupInfo';
import { getLogger, Logger } from '../common/log'; import { getLogger, Logger } from '../common/log';
import { ExperimentProfile, Manager, TrialJobStatistics} from '../common/manager'; import { ExperimentProfile, Manager, TrialJobStatistics, ExperimentStartUpMode } from '../common/manager';
import { ValidationSchemas } from './restValidationSchemas'; import { ValidationSchemas } from './restValidationSchemas';
import { NNIRestServer } from './nniRestServer'; import { NNIRestServer } from './nniRestServer';
import { getVersion } from '../common/utils'; import { getVersion } from '../common/utils';
...@@ -86,11 +86,11 @@ class NNIRestHandler { ...@@ -86,11 +86,11 @@ class NNIRestHandler {
return router; return router;
} }
private handle_error(err: Error, res: Response, isFatal: boolean = false): void { private handle_error(err: Error, res: Response, isFatal: boolean = false, errorCode: number = 500): void {
if (err instanceof NNIError && err.name === NNIErrorNames.NOT_FOUND) { if (err instanceof NNIError && err.name === NNIErrorNames.NOT_FOUND) {
res.status(404); res.status(404);
} else { } else {
res.status(500); res.status(errorCode);
} }
res.send({ res.send({
error: err.message error: err.message
...@@ -169,13 +169,13 @@ class NNIRestHandler { ...@@ -169,13 +169,13 @@ class NNIRestHandler {
this.handle_error(err, res); this.handle_error(err, res);
}); });
} else { } else {
this.nniManager.resumeExperiment().then(() => { this.nniManager.resumeExperiment(isReadonly()).then(() => {
res.send(); res.send();
}).catch((err: Error) => { }).catch((err: Error) => {
// Resume experiment is a step of initialization, so any exception thrown is a fatal // Resume experiment is a step of initialization, so any exception thrown is a fatal
this.handle_error(err, res); this.handle_error(err, res);
}); });
} }
}); });
} }
...@@ -193,18 +193,18 @@ class NNIRestHandler { ...@@ -193,18 +193,18 @@ class NNIRestHandler {
router.put( router.put(
'/experiment/cluster-metadata', expressJoi(ValidationSchemas.SETCLUSTERMETADATA), '/experiment/cluster-metadata', expressJoi(ValidationSchemas.SETCLUSTERMETADATA),
async (req: Request, res: Response) => { async (req: Request, res: Response) => {
// tslint:disable-next-line:no-any // tslint:disable-next-line:no-any
const metadata: any = req.body; const metadata: any = req.body;
const keys: string[] = Object.keys(metadata); const keys: string[] = Object.keys(metadata);
try { try {
for (const key of keys) { for (const key of keys) {
await this.nniManager.setClusterMetadata(key, JSON.stringify(metadata[key])); await this.nniManager.setClusterMetadata(key, JSON.stringify(metadata[key]));
}
res.send();
} catch (err) {
// setClusterMetata is a step of initialization, so any exception thrown is a fatal
this.handle_error(NNIError.FromError(err), res, true);
} }
res.send();
} catch (err) {
// setClusterMetata is a step of initialization, so any exception thrown is a fatal
this.handle_error(NNIError.FromError(err), res, true);
}
}); });
} }
......
...@@ -118,12 +118,17 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None ...@@ -118,12 +118,17 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None
node_command = 'node' node_command = 'node'
if sys.platform == 'win32': if sys.platform == 'win32':
node_command = os.path.join(entry_dir[:-3], 'Scripts', 'node.exe') node_command = os.path.join(entry_dir[:-3], 'Scripts', 'node.exe')
cmds = [node_command, entry_file, '--port', str(port), '--mode', platform, '--start_mode', mode] cmds = [node_command, entry_file, '--port', str(port), '--mode', platform]
if mode == 'view':
cmds += ['--start_mode', 'resume']
cmds += ['--readonly', 'true']
else:
cmds += ['--start_mode', mode]
if log_dir is not None: if log_dir is not None:
cmds += ['--log_dir', log_dir] cmds += ['--log_dir', log_dir]
if log_level is not None: if log_level is not None:
cmds += ['--log_level', log_level] cmds += ['--log_level', log_level]
if mode == 'resume': if mode in ['resume', 'view']:
cmds += ['--experiment_id', experiment_id] cmds += ['--experiment_id', experiment_id]
stdout_full_path, stderr_full_path = get_log_path(config_file_name) stdout_full_path, stderr_full_path = get_log_path(config_file_name)
with open(stdout_full_path, 'a+') as stdout_file, open(stderr_full_path, 'a+') as stderr_file: with open(stdout_full_path, 'a+') as stdout_file, open(stderr_full_path, 'a+') as stderr_file:
...@@ -156,7 +161,6 @@ def set_trial_config(experiment_config, port, config_file_name): ...@@ -156,7 +161,6 @@ def set_trial_config(experiment_config, port, config_file_name):
def set_local_config(experiment_config, port, config_file_name): def set_local_config(experiment_config, port, config_file_name):
'''set local configuration''' '''set local configuration'''
#set machine_list
request_data = dict() request_data = dict()
if experiment_config.get('localConfig'): if experiment_config.get('localConfig'):
request_data['local_config'] = experiment_config['localConfig'] request_data['local_config'] = experiment_config['localConfig']
...@@ -177,7 +181,7 @@ def set_local_config(experiment_config, port, config_file_name): ...@@ -177,7 +181,7 @@ def set_local_config(experiment_config, port, config_file_name):
fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':')))
return False, err_message return False, err_message
return set_trial_config(experiment_config, port, config_file_name) return set_trial_config(experiment_config, port, config_file_name), None
def set_remote_config(experiment_config, port, config_file_name): def set_remote_config(experiment_config, port, config_file_name):
'''Call setClusterMetadata to pass trial''' '''Call setClusterMetadata to pass trial'''
...@@ -345,7 +349,6 @@ def set_experiment(experiment_config, mode, port, config_file_name): ...@@ -345,7 +349,6 @@ def set_experiment(experiment_config, mode, port, config_file_name):
{'key': 'frameworkcontroller_config', 'value': experiment_config['frameworkcontrollerConfig']}) {'key': 'frameworkcontroller_config', 'value': experiment_config['frameworkcontrollerConfig']})
request_data['clusterMetaData'].append( request_data['clusterMetaData'].append(
{'key': 'trial_config', 'value': experiment_config['trial']}) {'key': 'trial_config', 'value': experiment_config['trial']})
response = rest_post(experiment_url(port), json.dumps(request_data), REST_TIME_OUT, show_error=True) response = rest_post(experiment_url(port), json.dumps(request_data), REST_TIME_OUT, show_error=True)
if check_response(response): if check_response(response):
return response return response
...@@ -357,6 +360,33 @@ def set_experiment(experiment_config, mode, port, config_file_name): ...@@ -357,6 +360,33 @@ def set_experiment(experiment_config, mode, port, config_file_name):
print_error('Setting experiment error, error message is {}'.format(response.text)) print_error('Setting experiment error, error message is {}'.format(response.text))
return None return None
def set_platform_config(platform, experiment_config, port, config_file_name, rest_process):
    '''Call the platform-specific set_*_config helper and abort the launch if it fails.

    Args:
        platform: training service platform name; one of 'local', 'remote',
            'pai', 'kubeflow' or 'frameworkcontroller'.
        experiment_config: experiment configuration, passed through to the helper.
        port: REST server port, passed through to the helper.
        config_file_name: config file name, passed through to the helper.
        rest_process: REST server process; its pid is killed when configuration fails.

    Raises:
        Exception: if the platform is not supported, or if killing the REST
            server process fails after a configuration error.

    On a configuration failure the error is printed, the REST server process is
    killed and the program exits with status 1.
    '''
    print_normal('Setting {0} config...'.format(platform))
    # Each helper takes (experiment_config, port, config_file_name) and returns
    # a (config_result, err_msg) pair.
    config_setters = {
        'local': set_local_config,
        'remote': set_remote_config,
        'pai': set_pai_config,
        'kubeflow': set_kubeflow_config,
        'frameworkcontroller': set_frameworkcontroller_config,
    }
    set_config = config_setters.get(platform)
    if set_config is None:
        # The original had an unreachable exit(1) after this raise; it is dropped.
        raise Exception(ERROR_INFO % 'Unsupported platform!')
    config_result, err_msg = set_config(experiment_config, port, config_file_name)
    if config_result:
        print_normal('Successfully set {0} config!'.format(platform))
        return
    print_error('Failed! Error is: {}'.format(err_msg))
    try:
        kill_command(rest_process.pid)
    except Exception:
        raise Exception(ERROR_INFO % 'Rest server stopped!')
    exit(1)
def launch_experiment(args, experiment_config, mode, config_file_name, experiment_id=None): def launch_experiment(args, experiment_config, mode, config_file_name, experiment_id=None):
'''follow steps to start rest server and start experiment''' '''follow steps to start rest server and start experiment'''
nni_config = Config(config_file_name) nni_config = Config(config_file_name)
...@@ -381,8 +411,10 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen ...@@ -381,8 +411,10 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
exit(1) exit(1)
log_dir = experiment_config['logDir'] if experiment_config.get('logDir') else None log_dir = experiment_config['logDir'] if experiment_config.get('logDir') else None
log_level = experiment_config['logLevel'] if experiment_config.get('logLevel') else None log_level = experiment_config['logLevel'] if experiment_config.get('logLevel') else None
if log_level not in ['trace', 'debug'] and (args.debug or experiment_config.get('debug') is True): #view experiment mode do not need debug function, when view an experiment, there will be no new logs created
log_level = 'debug' if mode != 'view':
if log_level not in ['trace', 'debug'] and (args.debug or experiment_config.get('debug') is True):
log_level = 'debug'
# start rest server # start rest server
rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id, log_dir, log_level) rest_process, start_time = start_rest_server(args.port, experiment_config['trainingServicePlatform'], mode, config_file_name, experiment_id, log_dir, log_level)
nni_config.set_config('restServerPid', rest_process.pid) nni_config.set_config('restServerPid', rest_process.pid)
...@@ -416,83 +448,14 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen ...@@ -416,83 +448,14 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
except Exception: except Exception:
raise Exception(ERROR_INFO % 'Rest server stopped!') raise Exception(ERROR_INFO % 'Rest server stopped!')
exit(1) exit(1)
if mode != 'view':
# set remote config # set platform configuration
if experiment_config['trainingServicePlatform'] == 'remote': set_platform_config(experiment_config['trainingServicePlatform'], experiment_config, args.port, config_file_name, rest_process)
print_normal('Setting remote config...')
config_result, err_msg = set_remote_config(experiment_config, args.port, config_file_name)
if config_result:
print_normal('Successfully set remote config!')
else:
print_error('Failed! Error is: {}'.format(err_msg))
try:
kill_command(rest_process.pid)
except Exception:
raise Exception(ERROR_INFO % 'Rest server stopped!')
exit(1)
# set local config
if experiment_config['trainingServicePlatform'] == 'local':
print_normal('Setting local config...')
if set_local_config(experiment_config, args.port, config_file_name):
print_normal('Successfully set local config!')
else:
print_error('Set local config failed!')
try:
kill_command(rest_process.pid)
except Exception:
raise Exception(ERROR_INFO % 'Rest server stopped!')
exit(1)
#set pai config
if experiment_config['trainingServicePlatform'] == 'pai':
print_normal('Setting pai config...')
config_result, err_msg = set_pai_config(experiment_config, args.port, config_file_name)
if config_result:
print_normal('Successfully set pai config!')
else:
if err_msg:
print_error('Failed! Error is: {}'.format(err_msg))
try:
kill_command(rest_process.pid)
except Exception:
raise Exception(ERROR_INFO % 'Restful server stopped!')
exit(1)
#set kubeflow config
if experiment_config['trainingServicePlatform'] == 'kubeflow':
print_normal('Setting kubeflow config...')
config_result, err_msg = set_kubeflow_config(experiment_config, args.port, config_file_name)
if config_result:
print_normal('Successfully set kubeflow config!')
else:
if err_msg:
print_error('Failed! Error is: {}'.format(err_msg))
try:
kill_command(rest_process.pid)
except Exception:
raise Exception(ERROR_INFO % 'Restful server stopped!')
exit(1)
#set frameworkcontroller config
if experiment_config['trainingServicePlatform'] == 'frameworkcontroller':
print_normal('Setting frameworkcontroller config...')
config_result, err_msg = set_frameworkcontroller_config(experiment_config, args.port, config_file_name)
if config_result:
print_normal('Successfully set frameworkcontroller config!')
else:
if err_msg:
print_error('Failed! Error is: {}'.format(err_msg))
try:
kill_command(rest_process.pid)
except Exception:
raise Exception(ERROR_INFO % 'Restful server stopped!')
exit(1)
# start a new experiment # start a new experiment
print_normal('Starting experiment...') print_normal('Starting experiment...')
# set debug configuration # set debug configuration
if experiment_config.get('debug') is None: if mode != 'view' and experiment_config.get('debug') is None:
experiment_config['debug'] = args.debug experiment_config['debug'] = args.debug
response = set_experiment(experiment_config, mode, args.port, config_file_name) response = set_experiment(experiment_config, mode, args.port, config_file_name)
if response: if response:
...@@ -519,8 +482,23 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen ...@@ -519,8 +482,23 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen
print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, ' '.join(web_ui_url_list))) print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, ' '.join(web_ui_url_list)))
def resume_experiment(args): def create_experiment(args):
'''resume an experiment''' '''start a new experiment'''
config_file_name = ''.join(random.sample(string.ascii_letters + string.digits, 8))
nni_config = Config(config_file_name)
config_path = os.path.abspath(args.config)
if not os.path.exists(config_path):
print_error('Please set correct config path!')
exit(1)
experiment_config = get_yml_content(config_path)
validate_all_content(experiment_config, config_path)
nni_config.set_config('experimentConfig', experiment_config)
launch_experiment(args, experiment_config, 'new', config_file_name)
nni_config.set_config('restServerPort', args.port)
def manage_stopped_experiment(args, mode):
'''view a stopped experiment'''
update_experiment() update_experiment()
experiment_config = Experiments() experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments() experiment_dict = experiment_config.get_all_experiments()
...@@ -528,38 +506,31 @@ def resume_experiment(args): ...@@ -528,38 +506,31 @@ def resume_experiment(args):
experiment_endTime = None experiment_endTime = None
#find the latest stopped experiment #find the latest stopped experiment
if not args.id: if not args.id:
print_error('Please set experiment id! \nYou could use \'nnictl resume {id}\' to resume a stopped experiment!\n' \ print_error('Please set experiment id! \nYou could use \'nnictl {0} {id}\' to {0} a stopped experiment!\n' \
'You could use \'nnictl experiment list --all\' to show all experiments!') 'You could use \'nnictl experiment list --all\' to show all experiments!'.format(mode))
exit(1) exit(1)
else: else:
if experiment_dict.get(args.id) is None: if experiment_dict.get(args.id) is None:
print_error('Id %s not exist!' % args.id) print_error('Id %s not exist!' % args.id)
exit(1) exit(1)
if experiment_dict[args.id]['status'] != 'STOPPED': if experiment_dict[args.id]['status'] != 'STOPPED':
print_error('Only stopped experiments can be resumed!') print_error('Only stopped experiments can be {0}ed!'.format(mode))
exit(1) exit(1)
experiment_id = args.id experiment_id = args.id
print_normal('Resuming experiment %s...' % experiment_id) print_normal('{0} experiment {1}...'.format(mode, experiment_id))
nni_config = Config(experiment_dict[experiment_id]['fileName']) nni_config = Config(experiment_dict[experiment_id]['fileName'])
experiment_config = nni_config.get_config('experimentConfig') experiment_config = nni_config.get_config('experimentConfig')
experiment_id = nni_config.get_config('experimentId') experiment_id = nni_config.get_config('experimentId')
new_config_file_name = ''.join(random.sample(string.ascii_letters + string.digits, 8)) new_config_file_name = ''.join(random.sample(string.ascii_letters + string.digits, 8))
new_nni_config = Config(new_config_file_name) new_nni_config = Config(new_config_file_name)
new_nni_config.set_config('experimentConfig', experiment_config) new_nni_config.set_config('experimentConfig', experiment_config)
launch_experiment(args, experiment_config, 'resume', new_config_file_name, experiment_id) launch_experiment(args, experiment_config, mode, new_config_file_name, experiment_id)
new_nni_config.set_config('restServerPort', args.port) new_nni_config.set_config('restServerPort', args.port)
def create_experiment(args): def view_experiment(args):
'''start a new experiment''' '''view a stopped experiment'''
config_file_name = ''.join(random.sample(string.ascii_letters + string.digits, 8)) manage_stopped_experiment(args, 'view')
nni_config = Config(config_file_name)
config_path = os.path.abspath(args.config)
if not os.path.exists(config_path):
print_error('Please set correct config path!')
exit(1)
experiment_config = get_yml_content(config_path)
validate_all_content(experiment_config, config_path)
nni_config.set_config('experimentConfig', experiment_config) def resume_experiment(args):
launch_experiment(args, experiment_config, 'new', config_file_name) '''resume an experiment'''
nni_config.set_config('restServerPort', args.port) manage_stopped_experiment(args, 'resume')
\ No newline at end of file
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
import argparse import argparse
import pkg_resources import pkg_resources
from .launcher import create_experiment, resume_experiment from .launcher import create_experiment, resume_experiment, view_experiment
from .updater import update_searchspace, update_concurrency, update_duration, update_trialnum, import_data from .updater import update_searchspace, update_concurrency, update_duration, update_trialnum, import_data
from .nnictl_utils import * from .nnictl_utils import *
from .package_management import * from .package_management import *
...@@ -66,6 +66,12 @@ def parse_args(): ...@@ -66,6 +66,12 @@ def parse_args():
parser_resume.add_argument('--debug', '-d', action='store_true', help=' set debug mode') parser_resume.add_argument('--debug', '-d', action='store_true', help=' set debug mode')
parser_resume.set_defaults(func=resume_experiment) parser_resume.set_defaults(func=resume_experiment)
# parse view command
parser_resume = subparsers.add_parser('view', help='view a stopped experiment')
parser_resume.add_argument('id', nargs='?', help='The id of the experiment you want to view')
parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server')
parser_resume.set_defaults(func=view_experiment)
# parse update command # parse update command
parser_updater = subparsers.add_parser('update', help='update the experiment') parser_updater = subparsers.add_parser('update', help='update the experiment')
#add subparsers for parser_updater #add subparsers for parser_updater
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment