"docs/_removed/TrialExample/MnistExamples.rst" did not exist on "a441558c7b79fa0feaf4868b4b8fa1d66b4120c1"
Commit b40e3db7 authored by quzha's avatar quzha
Browse files

Merge branch 'master' of github.com:Microsoft/nni into dev-retiarii

parents efa4e31c 95f731e4
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'use strict';
/**
 * Contract for the component that keeps track of experiment records.
 *
 * Concrete implementations decide where the records live; this class only
 * fixes the operations that callers may rely on.
 */
abstract class ExperimentManager {
    /**
     * Point the manager at a different location for the experiment records.
     *
     * @param newPath location to use from now on
     */
    public abstract setExperimentPath(newPath: string): void;

    /**
     * Set one field of one experiment's record.
     *
     * @param experimentId id of the experiment whose record is changed
     * @param key name of the field to set
     * @param value new value of the field
     */
    public abstract setExperimentInfo(experimentId: string, key: string, value: any): void;

    /**
     * Fetch the information of the known experiments.
     *
     * @returns a promise resolving to the experiment information
     */
    public abstract getExperimentsInfo(): Promise<JSON>;

    /**
     * Shut the manager down.
     *
     * @returns a promise that settles once the manager has stopped
     */
    public abstract stop(): Promise<void>;
}
export {ExperimentManager};
...@@ -105,6 +105,7 @@ abstract class Manager { ...@@ -105,6 +105,7 @@ abstract class Manager {
public abstract getTrialLog(trialJobId: string, logType: LogType): Promise<string>; public abstract getTrialLog(trialJobId: string, logType: LogType): Promise<string>;
public abstract getTrialJobStatistics(): Promise<TrialJobStatistics[]>; public abstract getTrialJobStatistics(): Promise<TrialJobStatistics[]>;
public abstract getTrialJobMessage(trialJobId: string): string | undefined;
public abstract getStatus(): NNIManagerStatus; public abstract getStatus(): NNIManagerStatus;
} }
......
...@@ -42,6 +42,7 @@ interface TrialJobDetail { ...@@ -42,6 +42,7 @@ interface TrialJobDetail {
readonly workingDirectory: string; readonly workingDirectory: string;
readonly form: TrialJobApplicationForm; readonly form: TrialJobApplicationForm;
isEarlyStopped?: boolean; isEarlyStopped?: boolean;
message?: string;
} }
/** /**
......
...@@ -11,13 +11,16 @@ import { ChildProcess, spawn, StdioOptions } from 'child_process'; ...@@ -11,13 +11,16 @@ import { ChildProcess, spawn, StdioOptions } from 'child_process';
import * as fs from 'fs'; import * as fs from 'fs';
import * as os from 'os'; import * as os from 'os';
import * as path from 'path'; import * as path from 'path';
import * as lockfile from 'lockfile';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import { Container } from 'typescript-ioc'; import { Container } from 'typescript-ioc';
import * as util from 'util'; import * as util from 'util';
import * as glob from 'glob';
import { Database, DataStore } from './datastore'; import { Database, DataStore } from './datastore';
import { ExperimentStartupInfo, getExperimentStartupInfo, setExperimentStartupInfo } from './experimentStartupInfo'; import { ExperimentStartupInfo, getExperimentStartupInfo, setExperimentStartupInfo } from './experimentStartupInfo';
import { ExperimentParams, Manager } from './manager'; import { ExperimentParams, Manager } from './manager';
import { ExperimentManager } from './experimentManager';
import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService'; import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService';
import { logLevelNameMap } from './log'; import { logLevelNameMap } from './log';
...@@ -43,6 +46,10 @@ function getCheckpointDir(): string { ...@@ -43,6 +46,10 @@ function getCheckpointDir(): string {
return path.join(getExperimentRootDir(), 'checkpoint'); return path.join(getExperimentRootDir(), 'checkpoint');
} }
/**
 * Build the default path of the experiments information file:
 * `<home directory>/nni-experiments/.experiment`.
 *
 * @returns the path, built from the current user's home directory
 */
function getExperimentsInfoPath(): string {
    const experimentsRoot: string = path.join(os.homedir(), 'nni-experiments');
    return path.join(experimentsRoot, '.experiment');
}
function mkDirP(dirPath: string): Promise<void> { function mkDirP(dirPath: string): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
fs.exists(dirPath, (exists: boolean) => { fs.exists(dirPath, (exists: boolean) => {
...@@ -184,6 +191,7 @@ function prepareUnitTest(): void { ...@@ -184,6 +191,7 @@ function prepareUnitTest(): void {
Container.snapshot(DataStore); Container.snapshot(DataStore);
Container.snapshot(TrainingService); Container.snapshot(TrainingService);
Container.snapshot(Manager); Container.snapshot(Manager);
Container.snapshot(ExperimentManager);
const logLevel: string = parseArg(['--log_level', '-ll']); const logLevel: string = parseArg(['--log_level', '-ll']);
if (logLevel.length > 0 && !logLevelNameMap.has(logLevel)) { if (logLevel.length > 0 && !logLevelNameMap.has(logLevel)) {
...@@ -211,6 +219,7 @@ function cleanupUnitTest(): void { ...@@ -211,6 +219,7 @@ function cleanupUnitTest(): void {
Container.restore(DataStore); Container.restore(DataStore);
Container.restore(Database); Container.restore(Database);
Container.restore(ExperimentStartupInfo); Container.restore(ExperimentStartupInfo);
Container.restore(ExperimentManager);
} }
let cachedipv4Address: string = ''; let cachedipv4Address: string = '';
...@@ -416,8 +425,29 @@ function unixPathJoin(...paths: any[]): string { ...@@ -416,8 +425,29 @@ function unixPathJoin(...paths: any[]): string {
return dir; return dir;
} }
/**
 * Run `func(...args)` synchronously while holding a lock file for `filePath`.
 *
 * The lock file lives next to `filePath` and is named `<basename>.lock.<pid>`,
 * where `<pid>` is the id of the current process.
 *
 * When `lockOpts.stale` is a number, every existing `<basename>.lock.*` file is
 * inspected before locking; the call is refused unless each of them was last
 * modified more than `stale` milliseconds ago.
 *
 * The lock is always released once it has been taken, including when `func` throws.
 *
 * @param func function to execute while the lock is held
 * @param filePath path of the file the lock protects
 * @param lockOpts options handed to `lockfile.lockSync`; only `stale` is read here
 * @param args arguments forwarded to `func`
 * @returns the value returned by `func`
 * @throws Error with message 'File has been locked.' when a lock file that is not
 *         expired exists; errors raised by `lockfile.lockSync` or by `func` propagate
 */
function withLockSync(func: Function, filePath: string, lockOpts: {[key: string]: any}, ...args: any): any {
    const lockName = path.join(path.dirname(filePath), path.basename(filePath) + `.lock.${process.pid}`);
    if (typeof lockOpts.stale === 'number') {
        const lockPath = path.join(path.dirname(filePath), path.basename(filePath) + '.lock.*');
        const lockFileNames: string[] = glob.sync(lockPath);
        // A lock file that vanished between the glob and this check is treated as not expired.
        const canLock: boolean = lockFileNames.every((fileName) => {
            return fs.existsSync(fileName) && Date.now() - fs.statSync(fileName).mtimeMs > lockOpts.stale;
        });
        if (!canLock) {
            throw new Error('File has been locked.');
        }
    }
    lockfile.lockSync(lockName, lockOpts);
    try {
        return func(...args);
    } finally {
        // Release the lock even if func throws, so a failure does not block later callers.
        lockfile.unlockSync(lockName);
    }
}
export { export {
countFilesRecursively, validateFileNameRecursively, generateParamFileName, getMsgDispatcherCommand, getCheckpointDir, countFilesRecursively, validateFileNameRecursively, generateParamFileName, getMsgDispatcherCommand, getCheckpointDir, getExperimentsInfoPath,
getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address, unixPathJoin, getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address, unixPathJoin, withLockSync,
mkDirP, mkDirPSync, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomInt, randomSelect, getLogLevel, getVersion, getCmdPy, getTunerProc, isAlive, killPid, getNewLine mkDirP, mkDirPSync, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomInt, randomSelect, getLogLevel, getVersion, getCmdPy, getTunerProc, isAlive, killPid, getNewLine
}; };
{
"apiVersion": "apiextensions.k8s.io/v1beta1",
"kind": "CustomResourceDefinition",
"metadata": {
"name": "adaptdljobs.adaptdl.petuum.com"
},
"spec": {
"group": "adaptdl.petuum.com",
"version": "v1",
"scope": "Namespaced",
"names": {
"plural": "adaptdljobs",
"singular": "adaptdljob",
"kind": "AdaptDLJob"
}
}
}
{
"apiVersion": "v1",
"kind": "ConfigMap",
"metadata": {
"name": "<name>",
"ownerReferences": [
{
"apiVersion": "adaptdl.petuum.com/v1",
"kind": "AdaptDLJob",
"name": "<adaptdljob_name>",
"uid": "<adaptdljob_uid>"
}
]
},
"data": {
"run.sh": "<run_script>",
"cleanup.sh": "<clean_script>"
}
}
{
"apiVersion": "v1",
"kind": "PersistentVolumeClaim",
"metadata": {
"name": "<name>",
"ownerReferences": [
{
"apiVersion": "adaptdl.petuum.com/v1",
"kind": "AdaptDLJob",
"name": "<adaptdljob_name>",
"uid": "<adaptdljob_uid>"
}
]
},
"spec": {
"accessModes": [
"ReadWriteMany"
],
"resources": {
"requests": {
"storage": "<storage_size>"
}
},
"storageClassName": "<storage_class>",
"volumeMode": "Filesystem"
}
}
{
"apiVersion": "apps/v1",
"kind": "Deployment",
"metadata": {
"name": "<name>",
"labels": {
"expId": "<exp_id>"
}
},
"spec": {
"selector": {
"matchLabels": {
"app": "<name>"
}
},
"replicas": 1,
"template": {
"metadata": {
"labels": {
"app": "<name>"
}
},
"spec": {
"containers": [
{
"command": ["tensorboard"],
"args": ["--host=0.0.0.0", "--logdir=/adaptdl/tensorboard", "--port=6006"],
"image": "tensorflow/tensorflow",
"name": "tensorboard",
"ports": [
{
"containerPort": 6006
}
],
"volumeMounts": [
{
"mountPath": "/adaptdl/tensorboard",
"name": "adaptdl-tensorboard-pvc",
"subPath": "adaptdl/tensorboard"
}
]
}
],
"volumes": [
{
"name": "adaptdl-tensorboard-pvc",
"persistentVolumeClaim": {
"claimName": "<adaptdl_tensorflow_pvc_name>"
}
}
]
}
}
}
}
\ No newline at end of file
{
"apiVersion": "v1",
"kind": "PersistentVolumeClaim",
"metadata": {
"name": "<name>",
"ownerReferences": [
{
"apiVersion": "apps/v1",
"kind": "Deployment",
"name": "<adaptdl_tensorboard_name>",
"uid": "<adaptdl_tensorboard_uid>"
}
]
},
"spec": {
"accessModes": [
"ReadWriteMany"
],
"resources": {
"requests": {
"storage": "<storage_size>"
}
},
"storageClassName": "<storage_class>",
"volumeMode": "Filesystem"
}
}
{
"apiVersion": "adaptdl.petuum.com/v1",
"kind": "AdaptDLJob",
"metadata": {
"name": "<name>",
"labels": {
"app": "<app_name>",
"expId": "<exp_id>",
"trialId": "<trial_id>"
}
},
"spec": {
"preemptible": false,
"template": {
"spec": {
"containers": [
{
"lifecycle":
{
"preStop":
{
"exec":
{
"command": ["/cleanup.sh"]
}
}
},
"command": ["/run.sh"],
"env": [
{
"name": "ADAPTDL_CHECKPOINT_PATH",
"value": "/adaptdl/checkpoint"
},
{
"name": "ADAPTDL_TENSORBOARD_LOGDIR",
"value": "/adaptdl/tensorboard"
},
{
"name": "ADAPTDL_SHARE_PATH",
"value": "/adaptdl/share"
}
],
"image": "<image>",
"imagePullPolicy": "Always",
"name": "main",
"resources": {
"requests": {
"memory": "<memorySize>",
"cpu": "<cpuNum>"
},
"limits": {
"nvidia.com/gpu": 1
}
},
"volumeMounts": [
{
"mountPath": "/adaptdl/checkpoint",
"name": "adaptdl-pvc",
"subPath": "adaptdl/checkpoint"
},
{
"mountPath": "/adaptdl/share",
"name": "adaptdl-pvc",
"subPath": "adaptdl/share"
},
{
"mountPath": "/adaptdl/tensorboard",
"name": "adaptdl-tensorboard-pvc",
"subPath": "adaptdl/tensorboard"
},
{
"mountPath": "/cleanup.sh",
"name": "adaptdl-nni-configmap",
"subPath": "cleanup.sh"
},
{
"mountPath": "/run.sh",
"name": "adaptdl-nni-configmap",
"subPath": "run.sh"
}
]
}
],
"imagePullSecrets": [],
"volumes": [
{
"name": "adaptdl-pvc",
"persistentVolumeClaim": {
"claimName": "<adaptdl_pvc_name>"
}
},
{
"name": "adaptdl-tensorboard-pvc",
"persistentVolumeClaim": {
"claimName": "<adaptdl_tensorflow_pvc_name>"
}
},
{
"name": "adaptdl-nni-configmap",
"configMap": {
"name": "<adaptdl_nni_configmap_name>",
"defaultMode": 511
}
}
]
}
}
}
}
...@@ -168,7 +168,7 @@ class NNIDataStore implements DataStore { ...@@ -168,7 +168,7 @@ class NNIDataStore implements DataStore {
const oneEntry: ExportedDataFormat = { const oneEntry: ExportedDataFormat = {
parameter: parameters.parameters, parameter: parameters.parameters,
value: JSON.parse(job.finalMetricData[0].data), value: JSON.parse(job.finalMetricData[0].data),
id: job.id trialJobId: job.trialJobId
}; };
exportedData.push(oneEntry); exportedData.push(oneEntry);
} else { } else {
...@@ -188,7 +188,7 @@ class NNIDataStore implements DataStore { ...@@ -188,7 +188,7 @@ class NNIDataStore implements DataStore {
const oneEntry: ExportedDataFormat = { const oneEntry: ExportedDataFormat = {
parameter: value, parameter: value,
value: metricValue, value: metricValue,
id: job.id trialJobId: job.trialJobId
}; };
exportedData.push(oneEntry); exportedData.push(oneEntry);
} }
...@@ -229,7 +229,7 @@ class NNIDataStore implements DataStore { ...@@ -229,7 +229,7 @@ class NNIDataStore implements DataStore {
} }
if (!(status !== undefined && jobInfo.status !== status)) { if (!(status !== undefined && jobInfo.status !== status)) {
if (jobInfo.status === 'SUCCEEDED') { if (jobInfo.status === 'SUCCEEDED') {
jobInfo.finalMetricData = finalMetricsMap.get(jobInfo.id); jobInfo.finalMetricData = finalMetricsMap.get(jobInfo.trialJobId);
} }
result.push(jobInfo); result.push(jobInfo);
} }
...@@ -320,7 +320,7 @@ class NNIDataStore implements DataStore { ...@@ -320,7 +320,7 @@ class NNIDataStore implements DataStore {
jobInfo = map.get(record.trialJobId); jobInfo = map.get(record.trialJobId);
} else { } else {
jobInfo = { jobInfo = {
id: record.trialJobId, trialJobId: record.trialJobId,
status: this.getJobStatusByLatestEvent('UNKNOWN', record.event), status: this.getJobStatusByLatestEvent('UNKNOWN', record.event),
hyperParameters: [] hyperParameters: []
}; };
...@@ -364,14 +364,14 @@ class NNIDataStore implements DataStore { ...@@ -364,14 +364,14 @@ class NNIDataStore implements DataStore {
const newHParam: any = this.parseHyperParameter(record.data); const newHParam: any = this.parseHyperParameter(record.data);
if (newHParam !== undefined) { if (newHParam !== undefined) {
if (jobInfo.hyperParameters !== undefined) { if (jobInfo.hyperParameters !== undefined) {
let hParamIds: Set<number> | undefined = hParamIdMap.get(jobInfo.id); let hParamIds: Set<number> | undefined = hParamIdMap.get(jobInfo.trialJobId);
if (hParamIds === undefined) { if (hParamIds === undefined) {
hParamIds = new Set(); hParamIds = new Set();
} }
if (!hParamIds.has(newHParam.parameter_index)) { if (!hParamIds.has(newHParam.parameter_index)) {
jobInfo.hyperParameters.push(JSON.stringify(newHParam)); jobInfo.hyperParameters.push(JSON.stringify(newHParam));
hParamIds.add(newHParam.parameter_index); hParamIds.add(newHParam.parameter_index);
hParamIdMap.set(jobInfo.id, hParamIds); hParamIdMap.set(jobInfo.trialJobId, hParamIds);
} }
} else { } else {
assert(false, 'jobInfo.hyperParameters is undefined'); assert(false, 'jobInfo.hyperParameters is undefined');
......
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'use strict';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
import * as assert from 'assert';
import { getLogger, Logger } from '../common/log';
import { isAlive, withLockSync, getExperimentsInfoPath, delay } from '../common/utils';
import { ExperimentManager } from '../common/experimentManager';
import { Deferred } from 'ts-deferred';
interface CrashedInfo {
experimentId: string;
isCrashed: boolean;
}
interface FileInfo {
buffer: Buffer;
mtime: number;
}
/**
 * ExperimentManager backed by a single JSON file that maps experiment ids to
 * their records.
 *
 * Every read and write of that file is executed through `withLockSync`, with a
 * stale threshold of two seconds passed as the lock option.
 */
class NNIExperimentsManager implements ExperimentManager {
    private experimentsPath: string;
    private log: Logger;
    // Pending retry timers of setExperimentInfo, indexed by the record key being written.
    private profileUpdateTimer: {[key: string]: any};

    constructor() {
        this.experimentsPath = getExperimentsInfoPath();
        this.log = getLogger();
        this.profileUpdateTimer = {};
    }

    /**
     * Read all experiment records and mark as 'STOPPED' those whose process is gone.
     *
     * Records whose status is not 'STOPPED' have their `pid` checked with `isAlive`.
     * If any turn out to be dead, the file is rewritten, provided its modification
     * time is still the one observed when it was read. If the file changed in
     * between, the whole operation is repeated after 500 ms.
     *
     * @returns the list of experiment records (the values of the id -> record map)
     * @throws Error('Lock file out of retries.') when the lock cannot be obtained
     */
    public async getExperimentsInfo(): Promise<JSON> {
        const fileInfo: FileInfo = await this.withLockIterated(this.readExperimentsInfo, 100);
        const experimentsInformation = JSON.parse(fileInfo.buffer.toString());
        const expIdList: Array<string> = Object.keys(experimentsInformation).filter((expId) => {
            return experimentsInformation[expId]['status'] !== 'STOPPED';
        });
        const updateList: Array<CrashedInfo> = (await Promise.all(expIdList.map((expId) => {
            return this.checkCrashed(expId, experimentsInformation[expId]['pid']);
        }))).filter(crashedInfo => crashedInfo.isCrashed);
        if (updateList.length === 0) {
            return JSON.parse(JSON.stringify(Object.keys(experimentsInformation).map(key => experimentsInformation[key])));
        }
        const result = await this.withLockIterated(
            this.updateAllStatus, 100, updateList.map(crashedInfo => crashedInfo.experimentId), fileInfo.mtime
        );
        if (result !== undefined) {
            return JSON.parse(JSON.stringify(Object.keys(result).map(key => result[key])));
        }
        // The file was modified after it was read: start over with fresh content.
        await delay(500);
        return await this.getExperimentsInfo();
    }

    /**
     * Use another file for the experiment records.
     *
     * A leading '~' is replaced by the home directory and a relative path is
     * resolved to an absolute one.
     *
     * @param newPath path of the experiments information file
     */
    public setExperimentPath(newPath: string): void {
        if (newPath[0] === '~') {
            newPath = path.join(os.homedir(), newPath.slice(1));
        }
        if (!path.isAbsolute(newPath)) {
            newPath = path.resolve(newPath);
        }
        this.log.info(`Set new experiment information path: ${newPath}`);
        this.experimentsPath = newPath;
    }

    /**
     * Set `key` to `value` in the record of `experimentId` and write the file back.
     *
     * Failures are logged and never thrown. When the failure is lock contention,
     * the write is scheduled again after 100 ms; a later call for the same `key`
     * cancels such a pending retry.
     *
     * @param experimentId id of the experiment, expected to exist in the file
     * @param key name of the field to set
     * @param value new value of the field
     */
    public setExperimentInfo(experimentId: string, key: string, value: any): void {
        try {
            if (this.profileUpdateTimer[key] !== undefined) {
                // if a new call with the same timerId occurs, destroy the unfinished old one
                clearTimeout(this.profileUpdateTimer[key]);
                this.profileUpdateTimer[key] = undefined;
            }
            this.withLockSync(() => {
                const experimentsInformation = JSON.parse(fs.readFileSync(this.experimentsPath).toString());
                assert(experimentId in experimentsInformation, `Experiment Manager: Experiment Id ${experimentId} not found, this should not happen`);
                experimentsInformation[experimentId][key] = value;
                fs.writeFileSync(this.experimentsPath, JSON.stringify(experimentsInformation, null, 4));
            });
        } catch (err) {
            this.log.error(err);
            // Only announce and schedule a retry when one is really going to happen.
            if (this.isLockContention(err)) {
                this.log.debug(`Experiment Manager: Retry set key value: ${experimentId} {${key}: ${value}}`);
                this.profileUpdateTimer[key] = setTimeout(this.setExperimentInfo.bind(this), 100, experimentId, key, value);
            }
        }
    }

    /**
     * Run `func` under the file lock, retrying every 50 ms while the lock is taken.
     *
     * @param func function to run while the lock is held; it is bound to this instance
     * @param retry number of retries still allowed
     * @param args arguments forwarded to `func`
     * @returns the value returned by `func`
     * @throws Error('Lock file out of retries.') once `retry` drops below zero;
     *         any error that is not lock contention is rethrown unchanged
     */
    private async withLockIterated (func: Function, retry: number, ...args: any): Promise<any> {
        if (retry < 0) {
            throw new Error('Lock file out of retries.');
        }
        try {
            return this.withLockSync(func, ...args);
        } catch(err) {
            if (this.isLockContention(err)) {
                // retry wait is 50ms
                await delay(50);
                return await this.withLockIterated(func, retry - 1, ...args);
            }
            throw err;
        }
    }

    /** Run `func`, bound to this instance, under the lock of the experiments file. */
    private withLockSync (func: Function, ...args: any): any {
        return withLockSync(func.bind(this), this.experimentsPath, {stale: 2 * 1000}, ...args);
    }

    /**
     * Tell whether `err` means that the lock is currently held by someone else:
     * either its `code` is 'EEXIST' or its message is 'File has been locked.'.
     */
    private isLockContention(err: any): boolean {
        if (err === undefined || err === null) {
            return false;
        }
        return err.code === 'EEXIST' || err.message === 'File has been locked.';
    }

    /** Read the experiments file together with its modification time. */
    private readExperimentsInfo(): FileInfo {
        const buffer: Buffer = fs.readFileSync(this.experimentsPath);
        const mtime: number = fs.statSync(this.experimentsPath).mtimeMs;
        return {buffer: buffer, mtime: mtime};
    }

    /** Report an experiment as crashed when `isAlive(pid)` resolves to false. */
    private async checkCrashed(expId: string, pid: number): Promise<CrashedInfo> {
        const alive: boolean = await isAlive(pid);
        return {experimentId: expId, isCrashed: !alive};
    }

    /**
     * Set the status of every experiment in `updateList` to 'STOPPED' and write the file.
     *
     * @param updateList ids of the experiments to mark as stopped
     * @param timestamp modification time the file had when the caller read it
     * @returns the updated id -> record map, or undefined (nothing written) when
     *          the file's modification time differs from `timestamp`
     */
    private updateAllStatus(updateList: Array<string>, timestamp: number): {[key: string]: any} | undefined {
        if (timestamp !== fs.statSync(this.experimentsPath).mtimeMs) {
            return undefined;
        }
        const experimentsInformation = JSON.parse(fs.readFileSync(this.experimentsPath).toString());
        updateList.forEach((expId: string) => {
            if (experimentsInformation[expId]) {
                experimentsInformation[expId]['status'] = 'STOPPED';
            } else {
                this.log.error(`Experiment Manager: Experiment Id ${expId} not found, this should not happen`);
            }
        });
        fs.writeFileSync(this.experimentsPath, JSON.stringify(experimentsInformation, null, 4));
        return experimentsInformation;
    }

    /** Stop the manager; a clean-up failure is logged rather than thrown. */
    public async stop(): Promise<void> {
        this.log.debug('Stopping experiment manager.');
        await this.cleanUp().catch(err=>this.log.error(err.message));
        this.log.debug('Experiment manager stopped.');
    }

    /**
     * Resolve immediately when no retry is pending; otherwise look again after
     * five seconds and reject if a retry is still pending then.
     */
    private async cleanUp(): Promise<void> {
        const deferred = new Deferred<void>();
        if (this.isUndone()) {
            this.log.debug('Experiment manager: something undone');
            setTimeout((): void => {
                if (this.isUndone()) {
                    deferred.reject(new Error('Still has undone after 5s, forced stop.'));
                } else {
                    deferred.resolve();
                }
            }, 5 * 1000);
        } else {
            this.log.debug('Experiment manager: all clean up');
            deferred.resolve();
        }
        return deferred.promise;
    }

    /** True while at least one retry timer is registered. */
    private isUndone(): boolean {
        return Object.keys(this.profileUpdateTimer).some((key: string) => {
            return this.profileUpdateTimer[key] !== undefined;
        });
    }
}
export { NNIExperimentsManager };
...@@ -15,6 +15,7 @@ import { ...@@ -15,6 +15,7 @@ import {
ExperimentParams, ExperimentProfile, Manager, ExperimentStatus, ExperimentParams, ExperimentProfile, Manager, ExperimentStatus,
NNIManagerStatus, ProfileUpdateType, TrialJobStatistics NNIManagerStatus, ProfileUpdateType, TrialJobStatistics
} from '../common/manager'; } from '../common/manager';
import { ExperimentManager } from '../common/experimentManager';
import { import {
TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus, LogType TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus, LogType
} from '../common/trainingService'; } from '../common/trainingService';
...@@ -31,6 +32,7 @@ import { createDispatcherInterface, IpcInterface } from './ipcInterface'; ...@@ -31,6 +32,7 @@ import { createDispatcherInterface, IpcInterface } from './ipcInterface';
class NNIManager implements Manager { class NNIManager implements Manager {
private trainingService: TrainingService; private trainingService: TrainingService;
private dispatcher: IpcInterface | undefined; private dispatcher: IpcInterface | undefined;
private experimentManager: ExperimentManager;
private currSubmittedTrialNum: number; // need to be recovered private currSubmittedTrialNum: number; // need to be recovered
private trialConcurrencyChange: number; // >0: increase, <0: decrease private trialConcurrencyChange: number; // >0: increase, <0: decrease
private log: Logger; private log: Logger;
...@@ -49,6 +51,7 @@ class NNIManager implements Manager { ...@@ -49,6 +51,7 @@ class NNIManager implements Manager {
this.currSubmittedTrialNum = 0; this.currSubmittedTrialNum = 0;
this.trialConcurrencyChange = 0; this.trialConcurrencyChange = 0;
this.trainingService = component.get(TrainingService); this.trainingService = component.get(TrainingService);
this.experimentManager = component.get(ExperimentManager);
assert(this.trainingService); assert(this.trainingService);
this.dispatcherPid = 0; this.dispatcherPid = 0;
this.waitingTrials = []; this.waitingTrials = [];
...@@ -231,7 +234,7 @@ class NNIManager implements Manager { ...@@ -231,7 +234,7 @@ class NNIManager implements Manager {
// Check the final status for WAITING and RUNNING jobs // Check the final status for WAITING and RUNNING jobs
await Promise.all(allTrialJobs await Promise.all(allTrialJobs
.filter((job: TrialJobInfo) => job.status === 'WAITING' || job.status === 'RUNNING') .filter((job: TrialJobInfo) => job.status === 'WAITING' || job.status === 'RUNNING')
.map((job: TrialJobInfo) => this.dataStore.storeTrialJobEvent('FAILED', job.id))); .map((job: TrialJobInfo) => this.dataStore.storeTrialJobEvent('FAILED', job.trialJobId)));
// Collect generated trials and imported trials // Collect generated trials and imported trials
const finishedTrialData: string = await this.exportData(); const finishedTrialData: string = await this.exportData();
...@@ -304,7 +307,7 @@ class NNIManager implements Manager { ...@@ -304,7 +307,7 @@ class NNIManager implements Manager {
// FIXME: can this be undefined? // FIXME: can this be undefined?
trial.sequenceId !== undefined && minSeqId <= trial.sequenceId && trial.sequenceId <= maxSeqId trial.sequenceId !== undefined && minSeqId <= trial.sequenceId && trial.sequenceId <= maxSeqId
)); ));
const targetTrialIds = new Set(targetTrials.map(trial => trial.id)); const targetTrialIds = new Set(targetTrials.map(trial => trial.trialJobId));
const allMetrics = await this.dataStore.getMetricData(); const allMetrics = await this.dataStore.getMetricData();
return allMetrics.filter(metric => targetTrialIds.has(metric.trialJobId)); return allMetrics.filter(metric => targetTrialIds.has(metric.trialJobId));
...@@ -345,6 +348,14 @@ class NNIManager implements Manager { ...@@ -345,6 +348,14 @@ class NNIManager implements Manager {
return this.status; return this.status;
} }
/**
 * Look up the message stored on a trial job.
 *
 * @param trialJobId id used as the key into `this.trialJobs`
 * @returns the job's `message`, or undefined when no job is stored under that id
 */
public getTrialJobMessage(trialJobId: string): string | undefined {
    const jobDetail = this.trialJobs.get(trialJobId);
    return jobDetail === undefined ? undefined : jobDetail.message;
}
public async listTrialJobs(status?: TrialJobStatus): Promise<TrialJobInfo[]> { public async listTrialJobs(status?: TrialJobStatus): Promise<TrialJobInfo[]> {
return this.dataStore.listTrialJobs(status); return this.dataStore.listTrialJobs(status);
} }
...@@ -459,7 +470,9 @@ class NNIManager implements Manager { ...@@ -459,7 +470,9 @@ class NNIManager implements Manager {
} }
} }
await this.trainingService.cleanUp(); await this.trainingService.cleanUp();
this.experimentProfile.endTime = Date.now(); if (this.experimentProfile.endTime === undefined) {
this.setEndtime();
}
await this.storeExperimentProfile(); await this.storeExperimentProfile();
this.setStatus('STOPPED'); this.setStatus('STOPPED');
} }
...@@ -501,6 +514,10 @@ class NNIManager implements Manager { ...@@ -501,6 +514,10 @@ class NNIManager implements Manager {
this.trialJobs.set(trialJobId, Object.assign({}, trialJobDetail)); this.trialJobs.set(trialJobId, Object.assign({}, trialJobDetail));
await this.dataStore.storeTrialJobEvent(trialJobDetail.status, trialJobDetail.id, undefined, trialJobDetail); await this.dataStore.storeTrialJobEvent(trialJobDetail.status, trialJobDetail.id, undefined, trialJobDetail);
} }
const newTrialJobDetail: TrialJobDetail | undefined = this.trialJobs.get(trialJobId);
if (newTrialJobDetail !== undefined) {
newTrialJobDetail.message = trialJobDetail.message;
}
let hyperParams: string | undefined = undefined; let hyperParams: string | undefined = undefined;
switch (trialJobDetail.status) { switch (trialJobDetail.status) {
case 'SUCCEEDED': case 'SUCCEEDED':
...@@ -584,7 +601,7 @@ class NNIManager implements Manager { ...@@ -584,7 +601,7 @@ class NNIManager implements Manager {
assert(allFinishedTrialJobNum <= waitSubmittedToFinish); assert(allFinishedTrialJobNum <= waitSubmittedToFinish);
if (allFinishedTrialJobNum >= waitSubmittedToFinish) { if (allFinishedTrialJobNum >= waitSubmittedToFinish) {
this.setStatus('DONE'); this.setStatus('DONE');
this.experimentProfile.endTime = Date.now(); this.setEndtime();
await this.storeExperimentProfile(); await this.storeExperimentProfile();
// write this log for travis CI // write this log for travis CI
this.log.info('Experiment done.'); this.log.info('Experiment done.');
...@@ -678,11 +695,15 @@ class NNIManager implements Manager { ...@@ -678,11 +695,15 @@ class NNIManager implements Manager {
private async onTrialJobMetrics(metric: TrialJobMetric): Promise<void> { private async onTrialJobMetrics(metric: TrialJobMetric): Promise<void> {
this.log.debug(`NNIManager received trial job metrics: ${metric}`); this.log.debug(`NNIManager received trial job metrics: ${metric}`);
await this.dataStore.storeMetricData(metric.id, metric.data); if (this.trialJobs.has(metric.id)){
if (this.dispatcher === undefined) { await this.dataStore.storeMetricData(metric.id, metric.data);
throw new Error('Error: tuner has not been setup'); if (this.dispatcher === undefined) {
throw new Error('Error: tuner has not been setup');
}
this.dispatcher.sendCommand(REPORT_METRIC_DATA, metric.data);
} else {
this.log.warning(`NNIManager received non-existent trial job metrics: ${metric}`);
} }
this.dispatcher.sendCommand(REPORT_METRIC_DATA, metric.data);
} }
private requestTrialJobs(jobNum: number): void { private requestTrialJobs(jobNum: number): void {
...@@ -780,6 +801,7 @@ class NNIManager implements Manager { ...@@ -780,6 +801,7 @@ class NNIManager implements Manager {
this.log.error(err.stack); this.log.error(err.stack);
} }
this.status.errors.push(err.message); this.status.errors.push(err.message);
this.setEndtime();
this.setStatus('ERROR'); this.setStatus('ERROR');
} }
...@@ -787,9 +809,15 @@ class NNIManager implements Manager { ...@@ -787,9 +809,15 @@ class NNIManager implements Manager {
if (status !== this.status.status) { if (status !== this.status.status) {
this.log.info(`Change NNIManager status from: ${this.status.status} to: ${status}`); this.log.info(`Change NNIManager status from: ${this.status.status} to: ${status}`);
this.status.status = status; this.status.status = status;
this.experimentManager.setExperimentInfo(this.experimentProfile.id, 'status', this.status.status);
} }
} }
/**
 * Stamp the experiment profile with the current time as its end time and pass
 * the same value to the experiment manager under the key 'endTime'.
 */
private setEndtime(): void {
    const finishedAt = Date.now();
    this.experimentProfile.endTime = finishedAt;
    this.experimentManager.setExperimentInfo(this.experimentProfile.id, 'endTime', finishedAt);
}
private createEmptyExperimentProfile(): ExperimentProfile { private createEmptyExperimentProfile(): ExperimentProfile {
return { return {
id: getExperimentId(), id: getExperimentId(),
......
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'use strict';
import { assert, expect } from 'chai';
import * as fs from 'fs';
import { Container, Scope } from 'typescript-ioc';
import * as component from '../../common/component';
import { cleanupUnitTest, prepareUnitTest } from '../../common/utils';
import { ExperimentManager } from '../../common/experimentManager';
import { NNIExperimentsManager } from '../nniExperimentsManager';
describe('Unit test for experiment manager', function () {
    let experimentManager: NNIExperimentsManager;
    // The lookup in the test matches on the record's `id` field, so the mocked
    // record has to carry one; without it no expectation would ever run.
    const mockedInfo = {
        "test": {
            "id": "test",
            "port": 8080,
            "startTime": 1605246730756,
            "endTime": "N/A",
            "status": "INITIALIZED",
            "platform": "local",
            "experimentName": "testExp",
            "tag": [], "pid": 11111,
            "webuiUrl": [],
            "logDir": null
        }
    };

    before(() => {
        prepareUnitTest();
        fs.writeFileSync('.experiment.test', JSON.stringify(mockedInfo));
        Container.bind(ExperimentManager).to(NNIExperimentsManager).scope(Scope.Singleton);
        experimentManager = component.get(NNIExperimentsManager);
        experimentManager.setExperimentPath('.experiment.test');
    });

    after(() => {
        if (fs.existsSync('.experiment.test')) {
            fs.unlinkSync('.experiment.test');
        }
        cleanupUnitTest();
    });

    it('test getExperimentsInfo', () => {
        // Returning the promise lets mocha report a rejection with its original error.
        return experimentManager.getExperimentsInfo().then(function (experimentsInfo: {[key: string]: any}) {
            const testExperiment = Object.keys(experimentsInfo)
                .map(idx => experimentsInfo[idx])
                .find(info => info['id'] === 'test');
            // Fail loudly when the record is missing instead of passing without checking anything.
            assert.isDefined(testExperiment, 'experiment "test" is missing from the returned information');
            // NOTE(review): this relies on no live process having pid 11111 on the test machine.
            expect(testExperiment['status']).to.be.oneOf(['STOPPED', 'ERROR']);
        });
    });
});
...@@ -161,7 +161,7 @@ class MockedDataStore implements DataStore { ...@@ -161,7 +161,7 @@ class MockedDataStore implements DataStore {
} }
if (!(status && jobInfo.status !== status)) { if (!(status && jobInfo.status !== status)) {
if (jobInfo.status === 'SUCCEEDED') { if (jobInfo.status === 'SUCCEEDED') {
jobInfo.finalMetricData = await this.getFinalMetricData(jobInfo.id); jobInfo.finalMetricData = await this.getFinalMetricData(jobInfo.trialJobId);
} }
result.push(jobInfo); result.push(jobInfo);
} }
...@@ -206,7 +206,7 @@ class MockedDataStore implements DataStore { ...@@ -206,7 +206,7 @@ class MockedDataStore implements DataStore {
public getTrialJob(trialJobId: string): Promise<TrialJobInfo> { public getTrialJob(trialJobId: string): Promise<TrialJobInfo> {
return Promise.resolve({ return Promise.resolve({
id: '1234', trialJobId: '1234',
status: 'SUCCEEDED', status: 'SUCCEEDED',
startTime: Date.now(), startTime: Date.now(),
endTime: Date.now() endTime: Date.now()
...@@ -242,7 +242,7 @@ class MockedDataStore implements DataStore { ...@@ -242,7 +242,7 @@ class MockedDataStore implements DataStore {
jobInfo = map.get(record.trialJobId); jobInfo = map.get(record.trialJobId);
} else { } else {
jobInfo = { jobInfo = {
id: record.trialJobId, trialJobId: record.trialJobId,
status: this.getJobStatusByLatestEvent(record.event), status: this.getJobStatusByLatestEvent(record.event),
}; };
} }
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
'use strict'; 'use strict';
import * as fs from 'fs';
import * as os from 'os'; import * as os from 'os';
import { assert, expect } from 'chai'; import { assert, expect } from 'chai';
import { Container, Scope } from 'typescript-ioc'; import { Container, Scope } from 'typescript-ioc';
...@@ -10,9 +11,10 @@ import { Container, Scope } from 'typescript-ioc'; ...@@ -10,9 +11,10 @@ import { Container, Scope } from 'typescript-ioc';
import * as component from '../../common/component'; import * as component from '../../common/component';
import { Database, DataStore } from '../../common/datastore'; import { Database, DataStore } from '../../common/datastore';
import { Manager, ExperimentProfile} from '../../common/manager'; import { Manager, ExperimentProfile} from '../../common/manager';
import { ExperimentManager } from '../../common/experimentManager';
import { TrainingService } from '../../common/trainingService'; import { TrainingService } from '../../common/trainingService';
import { cleanupUnitTest, prepareUnitTest } from '../../common/utils'; import { cleanupUnitTest, prepareUnitTest } from '../../common/utils';
import { NNIDataStore } from '../nniDataStore'; import { NNIExperimentsManager } from '../nniExperimentsManager';
import { NNIManager } from '../nnimanager'; import { NNIManager } from '../nnimanager';
import { SqlDB } from '../sqlDatabase'; import { SqlDB } from '../sqlDatabase';
import { MockedTrainingService } from './mockedTrainingService'; import { MockedTrainingService } from './mockedTrainingService';
...@@ -25,6 +27,7 @@ async function initContainer(): Promise<void> { ...@@ -25,6 +27,7 @@ async function initContainer(): Promise<void> {
Container.bind(Manager).to(NNIManager).scope(Scope.Singleton); Container.bind(Manager).to(NNIManager).scope(Scope.Singleton);
Container.bind(Database).to(SqlDB).scope(Scope.Singleton); Container.bind(Database).to(SqlDB).scope(Scope.Singleton);
Container.bind(DataStore).to(MockedDataStore).scope(Scope.Singleton); Container.bind(DataStore).to(MockedDataStore).scope(Scope.Singleton);
Container.bind(ExperimentManager).to(NNIExperimentsManager).scope(Scope.Singleton);
await component.get<DataStore>(DataStore).init(); await component.get<DataStore>(DataStore).init();
} }
...@@ -87,9 +90,26 @@ describe('Unit test for nnimanager', function () { ...@@ -87,9 +90,26 @@ describe('Unit test for nnimanager', function () {
revision: 0 revision: 0
} }
// Mocked experiment record keyed by the experiment id 'unittest'; the test
// setup serializes this object with JSON.stringify into '.experiment.test'.
let mockedInfo = {
    unittest: {
        port: 8080,
        startTime: 1605246730756,
        endTime: "N/A",
        status: "INITIALIZED",
        platform: "local",
        experimentName: "testExp",
        tag: [],
        pid: 11111,
        webuiUrl: [],
        logDir: null
    }
};
before(async () => { before(async () => {
await initContainer(); await initContainer();
fs.writeFileSync('.experiment.test', JSON.stringify(mockedInfo));
const experimentsManager: ExperimentManager = component.get(ExperimentManager);
experimentsManager.setExperimentPath('.experiment.test');
nniManager = component.get(Manager); nniManager = component.get(Manager);
const expId: string = await nniManager.startExperiment(experimentParams); const expId: string = await nniManager.startExperiment(experimentParams);
assert.strictEqual(expId, 'unittest'); assert.strictEqual(expId, 'unittest');
...@@ -122,7 +142,7 @@ describe('Unit test for nnimanager', function () { ...@@ -122,7 +142,7 @@ describe('Unit test for nnimanager', function () {
it('test getTrialJob valid', () => { it('test getTrialJob valid', () => {
//query a exist id //query a exist id
return nniManager.getTrialJob('1234').then(function (trialJobDetail) { return nniManager.getTrialJob('1234').then(function (trialJobDetail) {
expect(trialJobDetail.id).to.be.equal('1234'); expect(trialJobDetail.trialJobId).to.be.equal('1234');
}).catch((error) => { }).catch((error) => {
assert.fail(error); assert.fail(error);
}) })
......
...@@ -12,13 +12,16 @@ import { Database, DataStore } from './common/datastore'; ...@@ -12,13 +12,16 @@ import { Database, DataStore } from './common/datastore';
import { setExperimentStartupInfo } from './common/experimentStartupInfo'; import { setExperimentStartupInfo } from './common/experimentStartupInfo';
import { getLogger, Logger, logLevelNameMap } from './common/log'; import { getLogger, Logger, logLevelNameMap } from './common/log';
import { Manager, ExperimentStartUpMode } from './common/manager'; import { Manager, ExperimentStartUpMode } from './common/manager';
import { ExperimentManager } from './common/experimentManager';
import { TrainingService } from './common/trainingService'; import { TrainingService } from './common/trainingService';
import { getLogDir, mkDirP, parseArg, uniqueString } from './common/utils'; import { getLogDir, mkDirP, parseArg } from './common/utils';
import { NNIDataStore } from './core/nniDataStore'; import { NNIDataStore } from './core/nniDataStore';
import { NNIManager } from './core/nnimanager'; import { NNIManager } from './core/nnimanager';
import { SqlDB } from './core/sqlDatabase'; import { SqlDB } from './core/sqlDatabase';
import { NNIExperimentsManager } from './core/nniExperimentsManager';
import { NNIRestServer } from './rest_server/nniRestServer'; import { NNIRestServer } from './rest_server/nniRestServer';
import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService'; import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService';
import { AdlTrainingService } from './training_service/kubernetes/adl/adlTrainingService';
import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService'; import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService';
import { LocalTrainingService } from './training_service/local/localTrainingService'; import { LocalTrainingService } from './training_service/local/localTrainingService';
import { RouterTrainingService } from './training_service/reusable/routerTrainingService'; import { RouterTrainingService } from './training_service/reusable/routerTrainingService';
...@@ -26,15 +29,18 @@ import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTr ...@@ -26,15 +29,18 @@ import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTr
import { DLTSTrainingService } from './training_service/dlts/dltsTrainingService'; import { DLTSTrainingService } from './training_service/dlts/dltsTrainingService';
/**
 * Forward the parsed command-line startup options to `setExperimentStartupInfo`.
 *
 * The "create new experiment" flag is derived by comparing `startExpMode`
 * with `ExperimentStartUpMode.NEW`; every other argument is passed through
 * unchanged, including the caller-supplied `experimentId`.
 */
function initStartupInfo(
    startExpMode: string, experimentId: string, basePort: number, platform: string,
    logDirectory: string, experimentLogLevel: string, readonly: boolean): void {
    setExperimentStartupInfo(
        startExpMode === ExperimentStartUpMode.NEW,
        experimentId,
        basePort,
        platform,
        logDirectory,
        experimentLogLevel,
        readonly
    );
}
async function initContainer(foreground: boolean, platformMode: string, logFileName?: string): Promise<void> { async function initContainer(foreground: boolean, platformMode: string, logFileName?: string): Promise<void> {
if (platformMode === 'local') { if (platformMode === 'adl') {
Container.bind(TrainingService)
.to(AdlTrainingService)
.scope(Scope.Singleton);
} else if (platformMode === 'local') {
Container.bind(TrainingService) Container.bind(TrainingService)
.to(LocalTrainingService) .to(LocalTrainingService)
.scope(Scope.Singleton); .scope(Scope.Singleton);
...@@ -78,6 +84,9 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN ...@@ -78,6 +84,9 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
Container.bind(DataStore) Container.bind(DataStore)
.to(NNIDataStore) .to(NNIDataStore)
.scope(Scope.Singleton); .scope(Scope.Singleton);
Container.bind(ExperimentManager)
.to(NNIExperimentsManager)
.scope(Scope.Singleton);
const DEFAULT_LOGFILE: string = path.join(getLogDir(), 'nnimanager.log'); const DEFAULT_LOGFILE: string = path.join(getLogDir(), 'nnimanager.log');
if (foreground) { if (foreground) {
logFileName = undefined; logFileName = undefined;
...@@ -94,7 +103,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN ...@@ -94,7 +103,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
/**
 * Print the command-line synopsis of the manager entry point via `console.info`.
 *
 * The `--mode` alternatives include `dlts`, which the mode validation in this
 * file accepts but the previous message left out. The message is assembled as
 * a single string instead of relying on a backslash line continuation, so its
 * text does not depend on how the source line happens to be indented.
 */
function usage(): void {
    const modes: string = [
        'adl', 'local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml'
    ].join('/');
    console.info(
        `usage: node main.js --port <port> --mode <${modes}>` +
        ' --start_mode <new/resume> --experiment_id <id> --foreground <true/false>'
    );
}
const strPort: string = parseArg(['--port', '-p']); const strPort: string = parseArg(['--port', '-p']);
...@@ -114,7 +123,7 @@ const foreground: boolean = foregroundArg.toLowerCase() === 'true' ? true : fals ...@@ -114,7 +123,7 @@ const foreground: boolean = foregroundArg.toLowerCase() === 'true' ? true : fals
const port: number = parseInt(strPort, 10); const port: number = parseInt(strPort, 10);
const mode: string = parseArg(['--mode', '-m']); const mode: string = parseArg(['--mode', '-m']);
if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml'].includes(mode)) { if (!['adl', 'local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml'].includes(mode)) {
console.log(`FATAL: unknown mode: ${mode}`); console.log(`FATAL: unknown mode: ${mode}`);
usage(); usage();
process.exit(1); process.exit(1);
...@@ -128,7 +137,7 @@ if (![ExperimentStartUpMode.NEW, ExperimentStartUpMode.RESUME].includes(startMod ...@@ -128,7 +137,7 @@ if (![ExperimentStartUpMode.NEW, ExperimentStartUpMode.RESUME].includes(startMod
} }
const experimentId: string = parseArg(['--experiment_id', '-id']); const experimentId: string = parseArg(['--experiment_id', '-id']);
if ((startMode === ExperimentStartUpMode.RESUME) && experimentId.trim().length < 1) { if (experimentId.trim().length < 1) {
console.log(`FATAL: cannot resume the experiment, invalid experiment_id: ${experimentId}`); console.log(`FATAL: cannot resume the experiment, invalid experiment_id: ${experimentId}`);
usage(); usage();
process.exit(1); process.exit(1);
...@@ -174,30 +183,14 @@ mkDirP(getLogDir()) ...@@ -174,30 +183,14 @@ mkDirP(getLogDir())
console.error(`Failed to create log dir: ${err.stack}`); console.error(`Failed to create log dir: ${err.stack}`);
}); });
function getStopSignal(): any { async function cleanUp(): Promise<void> {
if (process.platform === "win32") {
return 'SIGBREAK';
}
else {
return 'SIGTERM';
}
}
function getCtrlCSignal(): any {
return 'SIGINT';
}
process.on(getCtrlCSignal(), async () => {
const log: Logger = getLogger();
log.info(`Get SIGINT signal!`);
});
process.on(getStopSignal(), async () => {
const log: Logger = getLogger(); const log: Logger = getLogger();
let hasError: boolean = false; let hasError: boolean = false;
try { try {
const nniManager: Manager = component.get(Manager); const nniManager: Manager = component.get(Manager);
await nniManager.stopExperiment(); await nniManager.stopExperiment();
const experimentManager: ExperimentManager = component.get(ExperimentManager);
await experimentManager.stop();
const ds: DataStore = component.get(DataStore); const ds: DataStore = component.get(DataStore);
await ds.close(); await ds.close();
const restServer: NNIRestServer = component.get(NNIRestServer); const restServer: NNIRestServer = component.get(NNIRestServer);
...@@ -206,7 +199,11 @@ process.on(getStopSignal(), async () => { ...@@ -206,7 +199,11 @@ process.on(getStopSignal(), async () => {
hasError = true; hasError = true;
log.error(`${err.stack}`); log.error(`${err.stack}`);
} finally { } finally {
await log.close(); log.close();
process.exit(hasError ? 1 : 0); process.exit(hasError ? 1 : 0);
} }
}); }
// Register the same shutdown routine (cleanUp) for each of the three signals below.
// NOTE(review): SIGBREAK is presumably the Windows stop signal — confirm.
process.on('SIGTERM', cleanUp);
process.on('SIGBREAK', cleanUp);
process.on('SIGINT', cleanUp);
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
"ignore": "^5.1.4", "ignore": "^5.1.4",
"js-base64": "^2.4.9", "js-base64": "^2.4.9",
"kubernetes-client": "^6.5.0", "kubernetes-client": "^6.5.0",
"lockfile": "^1.0.4",
"python-shell": "^2.0.1", "python-shell": "^2.0.1",
"rx": "^4.1.0", "rx": "^4.1.0",
"sqlite3": "^5.0.0", "sqlite3": "^5.0.0",
...@@ -39,6 +40,7 @@ ...@@ -39,6 +40,7 @@
"@types/glob": "^7.1.1", "@types/glob": "^7.1.1",
"@types/js-base64": "^2.3.1", "@types/js-base64": "^2.3.1",
"@types/js-yaml": "^3.12.5", "@types/js-yaml": "^3.12.5",
"@types/lockfile": "^1.0.0",
"@types/mocha": "^8.0.3", "@types/mocha": "^8.0.3",
"@types/node": "10.12.18", "@types/node": "10.12.18",
"@types/request": "^2.47.1", "@types/request": "^2.47.1",
......
...@@ -12,6 +12,7 @@ import { NNIError, NNIErrorNames } from '../common/errors'; ...@@ -12,6 +12,7 @@ import { NNIError, NNIErrorNames } from '../common/errors';
import { isNewExperiment, isReadonly } from '../common/experimentStartupInfo'; import { isNewExperiment, isReadonly } from '../common/experimentStartupInfo';
import { getLogger, Logger } from '../common/log'; import { getLogger, Logger } from '../common/log';
import { ExperimentProfile, Manager, TrialJobStatistics } from '../common/manager'; import { ExperimentProfile, Manager, TrialJobStatistics } from '../common/manager';
import { ExperimentManager } from '../common/experimentManager';
import { ValidationSchemas } from './restValidationSchemas'; import { ValidationSchemas } from './restValidationSchemas';
import { NNIRestServer } from './nniRestServer'; import { NNIRestServer } from './nniRestServer';
import { getVersion } from '../common/utils'; import { getVersion } from '../common/utils';
...@@ -21,10 +22,12 @@ const expressJoi = require('express-joi-validator'); ...@@ -21,10 +22,12 @@ const expressJoi = require('express-joi-validator');
class NNIRestHandler { class NNIRestHandler {
private restServer: NNIRestServer; private restServer: NNIRestServer;
private nniManager: Manager; private nniManager: Manager;
private experimentsManager: ExperimentManager;
private log: Logger; private log: Logger;
constructor(rs: NNIRestServer) { constructor(rs: NNIRestServer) {
this.nniManager = component.get(Manager); this.nniManager = component.get(Manager);
this.experimentsManager = component.get(ExperimentManager);
this.restServer = rs; this.restServer = rs;
this.log = getLogger(); this.log = getLogger();
} }
...@@ -60,6 +63,7 @@ class NNIRestHandler { ...@@ -60,6 +63,7 @@ class NNIRestHandler {
this.getLatestMetricData(router); this.getLatestMetricData(router);
this.getTrialLog(router); this.getTrialLog(router);
this.exportData(router); this.exportData(router);
this.getExperimentsInfo(router);
// Express-joi-validator configuration // Express-joi-validator configuration
router.use((err: any, _req: Request, res: Response, _next: any) => { router.use((err: any, _req: Request, res: Response, _next: any) => {
...@@ -209,6 +213,7 @@ class NNIRestHandler { ...@@ -209,6 +213,7 @@ class NNIRestHandler {
this.nniManager.listTrialJobs(req.query.status).then((jobInfos: TrialJobInfo[]) => { this.nniManager.listTrialJobs(req.query.status).then((jobInfos: TrialJobInfo[]) => {
jobInfos.forEach((trialJob: TrialJobInfo) => { jobInfos.forEach((trialJob: TrialJobInfo) => {
this.setErrorPathForFailedJob(trialJob); this.setErrorPathForFailedJob(trialJob);
this.setMessageforJob(trialJob);
}); });
res.send(jobInfos); res.send(jobInfos);
}).catch((err: Error) => { }).catch((err: Error) => {
...@@ -221,6 +226,7 @@ class NNIRestHandler { ...@@ -221,6 +226,7 @@ class NNIRestHandler {
router.get('/trial-jobs/:id', (req: Request, res: Response) => { router.get('/trial-jobs/:id', (req: Request, res: Response) => {
this.nniManager.getTrialJob(req.params.id).then((jobDetail: TrialJobInfo) => { this.nniManager.getTrialJob(req.params.id).then((jobDetail: TrialJobInfo) => {
const jobInfo: TrialJobInfo = this.setErrorPathForFailedJob(jobDetail); const jobInfo: TrialJobInfo = this.setErrorPathForFailedJob(jobDetail);
this.setMessageforJob(jobInfo);
res.send(jobInfo); res.send(jobInfo);
}).catch((err: Error) => { }).catch((err: Error) => {
this.handleError(err, res); this.handleError(err, res);
...@@ -303,6 +309,16 @@ class NNIRestHandler { ...@@ -303,6 +309,16 @@ class NNIRestHandler {
}); });
} }
/**
 * Register the `GET /experiments-info` route on the given router.
 *
 * The handler responds with the JSON-serialized result of
 * `experimentsManager.getExperimentsInfo()`; if that promise rejects, the
 * error is handed to `handleError` together with the response object.
 */
private getExperimentsInfo(router: Router): void {
    const respondWithExperiments = (_request: Request, response: Response): void => {
        this.experimentsManager.getExperimentsInfo()
            .then((allExperiments: JSON) => {
                response.send(JSON.stringify(allExperiments));
            })
            .catch((failure: Error) => {
                this.handleError(failure, response);
            });
    };
    router.get('/experiments-info', respondWithExperiments);
}
private setErrorPathForFailedJob(jobInfo: TrialJobInfo): TrialJobInfo { private setErrorPathForFailedJob(jobInfo: TrialJobInfo): TrialJobInfo {
if (jobInfo === undefined || jobInfo.status !== 'FAILED' || jobInfo.logPath === undefined) { if (jobInfo === undefined || jobInfo.status !== 'FAILED' || jobInfo.logPath === undefined) {
return jobInfo; return jobInfo;
...@@ -311,6 +327,14 @@ class NNIRestHandler { ...@@ -311,6 +327,14 @@ class NNIRestHandler {
return jobInfo; return jobInfo;
} }
/**
 * Attach the manager's message for a trial job to its info record.
 *
 * An undefined `jobInfo` is returned untouched. Otherwise `message` is set to
 * the value `nniManager.getTrialJobMessage` returns for the job's
 * `trialJobId` (which may be undefined); the record is mutated in place and
 * then returned.
 */
private setMessageforJob(jobInfo: TrialJobInfo): TrialJobInfo {
    if (jobInfo !== undefined) {
        jobInfo.message = this.nniManager.getTrialJobMessage(jobInfo.trialJobId);
    }
    return jobInfo;
}
} }
export function createRestHandler(rs: NNIRestServer): Router { export function createRestHandler(rs: NNIRestServer): Router {
......
...@@ -32,6 +32,9 @@ export namespace ValidationSchemas { ...@@ -32,6 +32,9 @@ export namespace ValidationSchemas {
outputDir: joi.string(), outputDir: joi.string(),
cpuNum: joi.number().min(1), cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100), memoryMB: joi.number().min(100),
// ############## adl cpu and memory config ###############
memorySize: joi.string(),
// ########################################################
gpuNum: joi.number().min(0), gpuNum: joi.number().min(0),
command: joi.string().min(1), command: joi.string().min(1),
virtualCluster: joi.string(), virtualCluster: joi.string(),
...@@ -93,6 +96,20 @@ export namespace ValidationSchemas { ...@@ -93,6 +96,20 @@ export namespace ValidationSchemas {
minFailedTaskCount: joi.number(), minFailedTaskCount: joi.number(),
minSucceededTaskCount: joi.number() minSucceededTaskCount: joi.number()
}) })
}),
// Each entry must be an object carrying a non-empty `name`. The element schema
// is attached through `.items()`; passing it as an argument to `joi.array()`
// does not constrain the array's elements.
imagePullSecrets: joi.array().items(joi.object({
    name: joi.string().min(1).required()
})),
// ############## adl ###############
adaptive: joi.boolean(),
checkpoint: joi.object({
storageClass: joi.string().min(1).required(),
storageSize: joi.string().min(1).required()
}),
nfs: joi.object({
server: joi.string().min(1).required(),
path: joi.string().min(1).required(),
containerMountPath: joi.string().min(1).required()
}) })
}), }),
pai_yarn_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase pai_yarn_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment