Unverified Commit 10c177c2 authored by Junwei Sun's avatar Junwei Sun Committed by GitHub
Browse files

Support displaying trial logs in local mode (#2718)

parent e2a86899
......@@ -4,7 +4,7 @@
'use strict';
import { MetricDataRecord, MetricType, TrialJobInfo } from './datastore';
import { TrialJobStatus } from './trainingService';
import { TrialJobStatus, LogType } from './trainingService';
type ProfileUpdateType = 'TRIAL_CONCURRENCY' | 'MAX_EXEC_DURATION' | 'SEARCH_SPACE' | 'MAX_TRIAL_NUM';
type ExperimentStatus = 'INITIALIZED' | 'RUNNING' | 'ERROR' | 'STOPPING' | 'STOPPED' | 'DONE' | 'NO_MORE_TRIAL' | 'TUNER_NO_MORE_TRIAL';
......@@ -101,6 +101,8 @@ abstract class Manager {
public abstract getMetricDataByRange(minSeqId: number, maxSeqId: number): Promise<MetricDataRecord[]>;
public abstract getLatestMetricData(): Promise<MetricDataRecord[]>;
/**
 * Get the log of a trial job.
 * @param trialJobId ID of the trial job
 * @param logType which log to fetch ('TRIAL_LOG' or 'TRIAL_ERROR')
 * @returns a promise of the log text
 */
public abstract getTrialLog(trialJobId: string, logType: LogType): Promise<string>;
public abstract getTrialJobStatistics(): Promise<TrialJobStatistics[]>;
public abstract getStatus(): NNIManagerStatus;
}
......
......@@ -8,6 +8,8 @@
*/
type TrialJobStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED' | 'SYS_CANCELED' | 'EARLY_STOPPED';
/**
 * Kind of trial log that can be requested through getTrialLog.
 * In the local training service, 'TRIAL_LOG' is read from the trial's `trial.log` file
 * and 'TRIAL_ERROR' from its `stderr` file.
 */
type LogType = 'TRIAL_LOG' | 'TRIAL_ERROR';
interface TrainingServiceMetadata {
readonly key: string;
readonly value: string;
......@@ -79,6 +81,7 @@ abstract class TrainingService {
public abstract updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise<TrialJobDetail>;
public abstract get isMultiPhaseJobSupported(): boolean;
public abstract cancelTrialJob(trialJobId: string, isEarlyStopped?: boolean): Promise<void>;
/**
 * Get the log of a trial job.
 * Implementations that do not support log retrieval fail with MethodNotImplementedError.
 * @param trialJobId ID of the trial job
 * @param logType which log to fetch ('TRIAL_LOG' or 'TRIAL_ERROR')
 * @returns a promise of the log text
 */
public abstract getTrialLog(trialJobId: string, logType: LogType): Promise<string>;
public abstract setClusterMetadata(key: string, value: string): Promise<void>;
public abstract getClusterMetadata(key: string): Promise<string>;
public abstract cleanUp(): Promise<void>;
......@@ -98,5 +101,5 @@ class NNIManagerIpConfig {
export {
TrainingService, TrainingServiceError, TrialJobStatus, TrialJobApplicationForm,
TrainingServiceMetadata, TrialJobDetail, TrialJobMetric, HyperParameters,
NNIManagerIpConfig
NNIManagerIpConfig, LogType
};
......@@ -16,7 +16,7 @@ import {
NNIManagerStatus, ProfileUpdateType, TrialJobStatistics
} from '../common/manager';
import {
TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus
TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus, LogType
} from '../common/trainingService';
import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP, getTunerProc, getLogLevel, isAlive, killPid } from '../common/utils';
import {
......@@ -325,6 +325,10 @@ class NNIManager implements Manager {
// FIXME: unit test
}
/**
 * Get the log of a trial job by delegating to the configured training service.
 * @param trialJobId ID of the trial job
 * @param logType which log to fetch
 * @returns whatever the training service's getTrialLog resolves to; its rejection is propagated unchanged
 */
public async getTrialLog(trialJobId: string, logType: LogType): Promise<string> {
return this.trainingService.getTrialLog(trialJobId, logType);
}
public getExperimentProfile(): Promise<ExperimentProfile> {
// TO DO: using Promise.resolve()
const deferred: Deferred<ExperimentProfile> = new Deferred<ExperimentProfile>();
......
......@@ -7,7 +7,7 @@ import { Deferred } from 'ts-deferred';
import { Provider } from 'typescript-ioc';
import { MethodNotImplementedError } from '../../common/errors';
import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService';
import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService';
const testTrainingServiceProvider: Provider = {
get: () => { return new MockedTrainingService(); }
......@@ -63,6 +63,10 @@ class MockedTrainingService extends TrainingService {
return deferred.promise;
}
/**
 * Not supported by this mock.
 * The method is not `async`, so MethodNotImplementedError is thrown synchronously
 * instead of being delivered through a rejected promise.
 */
public getTrialLog(trialJobId: string, logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}
async run(): Promise<void> {
}
......
......@@ -57,6 +57,7 @@ class NNIRestHandler {
this.getMetricData(router);
this.getMetricDataByRange(router);
this.getLatestMetricData(router);
this.getTrialLog(router);
this.exportData(router);
// Express-joi-validator configuration
......@@ -268,6 +269,19 @@ class NNIRestHandler {
});
}
/**
 * Register `GET /trial-log/:id/:type`, which responds with the requested log of a trial.
 *
 * `:id` and `:type` are forwarded as-is to the manager's getTrialLog. An empty log is
 * replaced by the placeholder text 'No logs available.'; any failure is passed to handleError.
 *
 * @param router router on which the route is registered
 */
private getTrialLog(router: Router): void {
    router.get('/trial-log/:id/:type', (req: Request, res: Response) => {
        this.nniManager.getTrialLog(req.params.id, req.params.type).then((log: string) => {
            // Log content is arbitrary trial output. Serve it as plain text so a browser
            // never interprets it as HTML (Express defaults string bodies to text/html).
            res.type('text/plain');
            res.send(log === '' ? 'No logs available.' : log);
        }).catch((err: Error) => {
            this.handleError(err, res);
        });
    });
}
private exportData(router: Router): void {
router.get('/export-data', (req: Request, res: Response) => {
this.nniManager.exportData().then((exportedData: string) => {
......
......@@ -13,7 +13,7 @@ import {
TrialJobStatistics, NNIManagerStatus
} from '../../common/manager';
import {
TrialJobApplicationForm, TrialJobDetail, TrialJobStatus
TrialJobApplicationForm, TrialJobDetail, TrialJobStatus, LogType
} from '../../common/trainingService';
export const testManagerProvider: Provider = {
......@@ -118,6 +118,9 @@ export class MockedNNIManager extends Manager {
public getLatestMetricData(): Promise<MetricDataRecord[]> {
throw new MethodNotImplementedError();
}
/**
 * Not supported by this mock.
 * The method is not `async`, so MethodNotImplementedError is thrown synchronously
 * instead of being delivered through a rejected promise.
 */
public getTrialLog(trialJobId: string, logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}
public getExperimentProfile(): Promise<ExperimentProfile> {
const profile: ExperimentProfile = {
params: {
......
......@@ -12,9 +12,10 @@ import { EventEmitter } from 'events';
import { String } from 'typescript-string-operations';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import { MethodNotImplementedError } from '../../common/errors';
import {
NNIManagerIpConfig, TrainingService,
TrialJobApplicationForm, TrialJobDetail, TrialJobMetric
TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType
} from '../../common/trainingService';
import { DLTS_TRIAL_COMMAND_FORMAT } from './dltsData';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
......@@ -246,6 +247,10 @@ class DLTSTrainingService implements TrainingService {
return trialJob
}
/**
 * Trial log retrieval is not implemented for this training service.
 * Both parameters are ignored; the returned promise always rejects with MethodNotImplementedError.
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}
public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
this.metricsEmitter.on('metric', listener);
}
......
......@@ -12,8 +12,9 @@ import { Base64 } from 'js-base64';
import { String } from 'typescript-string-operations';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import { MethodNotImplementedError } from '../../common/errors';
import {
NNIManagerIpConfig, TrialJobDetail, TrialJobMetric
NNIManagerIpConfig, TrialJobDetail, TrialJobMetric, LogType
} from '../../common/trainingService';
import { delay, getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString } from '../../common/utils';
import { AzureStorageClientUtility } from './azureStorageClientUtils';
......@@ -98,6 +99,10 @@ abstract class KubernetesTrainingService {
return Promise.resolve(kubernetesTrialJob);
}
/**
 * Trial log retrieval is not implemented for this training service.
 * Both parameters are ignored; the returned promise always rejects with MethodNotImplementedError.
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}
public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
this.metricsEmitter.on('metric', listener);
}
......
......@@ -14,7 +14,7 @@ import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import {
HyperParameters, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric, TrialJobStatus
TrialJobDetail, TrialJobMetric, TrialJobStatus, LogType
} from '../../common/trainingService';
import {
delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, getNewLine, isAlive, uniqueString
......@@ -184,6 +184,18 @@ class LocalTrainingService implements TrainingService {
return trialJob;
}
/**
 * Read one of a trial's log files from `<rootDir>/trials/<trialJobId>/`.
 *
 * @param trialJobId ID of the trial job; must be a single directory name
 * @param logType 'TRIAL_LOG' reads `trial.log`, 'TRIAL_ERROR' reads `stderr`
 * @returns the file content decoded as UTF-8
 * @throws Error when the ID is not a plain directory name or the log type is unexpected;
 *         the promise also rejects with the file-system error if the file cannot be read
 */
public async getTrialLog(trialJobId: string, logType: LogType): Promise<string> {
    // Callers may forward the ID from a request URL, so refuse anything that could
    // step outside the trials directory ('..', or a value containing a path separator).
    const isPlainName: boolean = trialJobId !== ''
        && trialJobId !== '.'
        && trialJobId !== '..'
        && path.basename(trialJobId) === trialJobId;
    if (!isPlainName) {
        throw new Error(`invalid trial job id: ${trialJobId}`);
    }

    let logFileName: string;
    if (logType === 'TRIAL_LOG') {
        logFileName = 'trial.log';
    } else if (logType === 'TRIAL_ERROR') {
        logFileName = 'stderr';
    } else {
        // Reachable at runtime even though the type is a closed union:
        // the value is not validated before it gets here.
        throw new Error('unexpected log type');
    }

    return fs.promises.readFile(path.join(this.rootDir, 'trials', trialJobId, logFileName), 'utf8');
}
public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
this.eventEmitter.on('metric', listener);
}
......@@ -450,8 +462,8 @@ class LocalTrainingService implements TrainingService {
while (!this.stopping) {
while (!this.stopping && this.jobQueue.length !== 0) {
const trialJobId: string = this.jobQueue[0];
const trialJobDeatil: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId);
if (trialJobDeatil !== undefined && trialJobDeatil.status === 'WAITING') {
const trialJobDetail: LocalTrialJobDetail | undefined = this.jobMap.get(trialJobId);
if (trialJobDetail !== undefined && trialJobDetail.status === 'WAITING') {
const [success, resource] = this.tryGetAvailableResource();
if (!success) {
break;
......
......@@ -11,9 +11,10 @@ import { EventEmitter } from 'events';
import { Deferred } from 'ts-deferred';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import { MethodNotImplementedError } from '../../common/errors';
import {
NNIManagerIpConfig, TrainingService,
TrialJobApplicationForm, TrialJobDetail, TrialJobMetric
TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType
} from '../../common/trainingService';
import { delay } from '../../common/utils';
import { PAIJobInfoCollector } from './paiJobInfoCollector';
......@@ -117,6 +118,10 @@ abstract class PAITrainingService implements TrainingService {
return jobs;
}
/**
 * Trial log retrieval is not implemented for this training service.
 * Both parameters are ignored; the returned promise always rejects with MethodNotImplementedError.
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}
public async getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
if (this.paiClusterConfig === undefined) {
throw new Error('PAI Cluster config is not initialized');
......
......@@ -10,13 +10,13 @@ import * as path from 'path';
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
import { Deferred } from 'ts-deferred';
import * as component from '../../common/component';
import { NNIError, NNIErrorNames } from '../../common/errors';
import { NNIError, NNIErrorNames, MethodNotImplementedError } from '../../common/errors';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import { ObservableTimer } from '../../common/observableTimer';
import {
HyperParameters, NNIManagerIpConfig, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric
TrialJobDetail, TrialJobMetric, LogType
} from '../../common/trainingService';
import {
delay, generateParamFileName, getExperimentRootDir, getIPV4Address, getJobCancelStatus,
......@@ -180,6 +180,15 @@ class RemoteMachineTrainingService implements TrainingService {
}
}
/**
 * Get trial job log.
 * Not implemented for this training service: both parameters are ignored and the
 * returned promise always rejects with MethodNotImplementedError.
 * @param _trialJobId ID of trial job
 * @param _logType 'TRIAL_LOG' | 'TRIAL_ERROR'
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}
/**
* Add job metrics listener
* @param listener callback listener
......
......@@ -6,7 +6,8 @@
import { Container, Scope } from 'typescript-ioc';
import * as component from '../../common/component';
import { getLogger, Logger } from '../../common/log';
import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService';
import { MethodNotImplementedError } from '../../common/errors'
import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType } from '../../common/trainingService';
import { delay } from '../../common/utils';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { PAIClusterConfig } from '../pai/paiConfig';
......@@ -47,6 +48,10 @@ class RouterTrainingService implements TrainingService {
return await this.internalTrainingService.getTrialJob(trialJobId);
}
/**
 * Trial log retrieval is not implemented by the router.
 * Both parameters are ignored; the returned promise always rejects with MethodNotImplementedError.
 * NOTE(review): unlike getTrialJob, this does not forward to internalTrainingService,
 * so logs are unavailable even if the routed service supports them — confirm this is intended.
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}
public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
if (this.internalTrainingService === undefined) {
throw new Error("TrainingService is not assigned!");
......
......@@ -9,10 +9,10 @@ import * as path from 'path';
import { Writable } from 'stream';
import { String } from 'typescript-string-operations';
import * as component from '../../common/component';
import { NNIError, NNIErrorNames } from '../../common/errors';
import { NNIError, NNIErrorNames, MethodNotImplementedError } from '../../common/errors';
import { getBasePort, getExperimentId, getPlatform } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService';
import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus, LogType } from '../../common/trainingService';
import { delay, getExperimentRootDir, getIPV4Address, getLogLevel, getVersion, mkDirPSync, uniqueString } from '../../common/utils';
import { GPU_INFO, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, REPORT_METRIC_DATA, SEND_TRIAL_JOB_PARAMETER, STDOUT, TRIAL_END, VERSION_CHECK } from '../../core/commands';
import { ScheduleResultType } from '../../training_service/common/gpuData';
......@@ -111,6 +111,10 @@ class TrialDispatcher implements TrainingService {
return trial;
}
/**
 * Trial log retrieval is not implemented by the trial dispatcher.
 * Both parameters are ignored; the returned promise always rejects with MethodNotImplementedError.
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}
public async submitTrialJob(form: TrialJobApplicationForm): Promise<TrialDetail> {
if (this.trialConfig === undefined) {
throw new Error(`trialConfig not initialized!`);
......
......@@ -3,14 +3,14 @@
'use strict';
import * as assert from 'assert';
import * as chai from 'chai';
import * as chaiAsPromised from 'chai-as-promised';
import * as fs from 'fs';
import * as path from 'path';
import * as tmp from 'tmp';
import * as component from '../../common/component';
import { TrialJobApplicationForm, TrialJobDetail, TrainingService } from '../../common/trainingService';
import { cleanupUnitTest, delay, prepareUnitTest } from '../../common/utils';
import { TrialJobApplicationForm, TrialJobDetail} from '../../common/trainingService';
import { cleanupUnitTest, delay, prepareUnitTest, getExperimentRootDir } from '../../common/utils';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { LocalTrainingService } from '../local/localTrainingService';
......@@ -72,6 +72,36 @@ describe('Unit Test for LocalTrainingService', () => {
chai.expect(jobDetail.status).to.be.equals('USER_CANCELED');
}).timeout(20000);
it('Get trial log', async () => {
    // Configure the service and submit a trial so there is a job ID and working directory to use.
    await localTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, trialConfig);

    const form: TrialJobApplicationForm = {
        sequenceId: 0,
        hyperParameters: {
            value: 'mock hyperparameters',
            index: 0
        }
    };
    const jobDetail: TrialJobDetail = await localTrainingService.submitTrialJob(form);

    // Locations of the fixture files the service is expected to read back.
    const trialsDir: string = path.join(getExperimentRootDir(), 'trials');
    const workingDir = jobDetail.workingDirectory;
    const trialLogFile: string = path.join(workingDir, 'trial.log');
    const stderrFile: string = path.join(workingDir, 'stderr');

    // Create the directories and both log files by hand.
    fs.mkdirSync(trialsDir);
    fs.mkdirSync(workingDir);
    fs.writeFileSync(trialLogFile, 'trial log');
    fs.writeFileSync(stderrFile, 'trial stderr');

    // Each log type must return the content of its own file.
    const trialLog: string = await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_LOG');
    chai.expect(trialLog).to.be.equals('trial log');
    const trialError: string = await localTrainingService.getTrialLog(jobDetail.id, 'TRIAL_ERROR');
    chai.expect(trialError).to.be.equals('trial stderr');

    // Remove the fixtures (files first, then the now-empty directories) and cancel the trial.
    fs.unlinkSync(trialLogFile);
    fs.unlinkSync(stderrFile);
    fs.rmdirSync(workingDir);
    fs.rmdirSync(trialsDir);

    await localTrainingService.cancelTrialJob(jobDetail.id);
}).timeout(20000);
it('Read metrics, Add listener, and remove listener', async () => {
// set meta data
const trialConfig: string = `{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"${localCodeDir}\",\"gpuNum\":0}`
......
......@@ -2,6 +2,7 @@ import * as React from 'react';
import * as copy from 'copy-to-clipboard';
import { Stack, PrimaryButton, Pivot, PivotItem } from 'office-ui-fabric-react';
import { Trial } from '../../static/model/trial';
import { MANAGER_IP } from '../../static/const';
import { EXPERIMENT, TRIALS } from '../../static/datamodel';
import JSONTree from 'react-json-tree';
import PaiTrialLog from '../public-child/PaiTrialLog';
......@@ -9,6 +10,7 @@ import TrialLog from '../public-child/TrialLog';
import MessageInfo from '../Modals/MessageInfo';
import '../../static/style/overview.scss';
import '../../static/style/copyParameter.scss';
import '../../static/style/openRow.scss';
interface OpenRowProps {
trialId: string;
......@@ -55,6 +57,10 @@ class OpenRow extends React.Component<OpenRowProps, OpenRowState> {
}
}
/**
 * Open the requested log of this row's trial through `window.open`.
 * The URL is `<MANAGER_IP>/trial-log/<trialId>/<type>`.
 * @param type log type segment of the URL (the buttons pass 'TRIAL_LOG' or 'TRIAL_ERROR')
 */
openTrialLog = (type: string): void => {
    const { trialId } = this.props;
    const logUrl = `${MANAGER_IP}/trial-log/${trialId}/${type}`;
    window.open(logUrl);
}
render(): React.ReactNode {
const { isHidenInfo, typeInfo, info } = this.state;
const trialId = this.props.trialId;
......@@ -105,7 +111,23 @@ class OpenRow extends React.Component<OpenRowProps, OpenRowState> {
logCollection={EXPERIMENT.logCollectionEnabled}
/>
:
<TrialLog logStr={logPathRow} id={trialId} />
<div>
<TrialLog logStr={logPathRow} id={trialId} />
{/* view each trial log in drawer*/}
<div id="trialog">
<div className="copy" style={{ marginTop: 15 }}>
<PrimaryButton
onClick={this.openTrialLog.bind(this, 'TRIAL_LOG')}
text="View trial log"
/>
<PrimaryButton
onClick={this.openTrialLog.bind(this, 'TRIAL_ERROR')}
text="View trial error"
styles={{ root: { marginLeft: 15 } }}
/>
</div>
</div>
</div>
}
</PivotItem>
</Pivot>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment