Unverified Commit 806afeb6 authored by fishyds's avatar fishyds Committed by GitHub
Browse files

[Kubeflow Training Service] V1, merge from kubeflow branch to master branch (#382)

* Kubeflow TrainingService support, v1 (#373)

1. Create new Training Service: kubeflow trainning service, use 'kubectl' and kubeflow tfjobs CRD to submit and manage jobs
2. Update nni python SDK to support new kubeflow platform
3. Update nni python SDK's get_sequende_id() implementation, read NNI_TRIAL_SEQ_ID env variable, instead of reading .nni/sequence_id file
4. This version only supports Tensorflow operator. Will add more operators' support in future versions
parent b749266d
...@@ -36,8 +36,8 @@ import { LocalTrainingServiceForGPU } from './training_service/local/localTraini ...@@ -36,8 +36,8 @@ import { LocalTrainingServiceForGPU } from './training_service/local/localTraini
import { import {
RemoteMachineTrainingService RemoteMachineTrainingService
} from './training_service/remote_machine/remoteMachineTrainingService'; } from './training_service/remote_machine/remoteMachineTrainingService';
import { PAITrainingService } from './training_service/pai/paiTrainingService' import { PAITrainingService } from './training_service/pai/paiTrainingService';
import { KubeflowTrainingService } from './training_service/kubeflow/kubeflowTrainingService';
function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number) { function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number) {
const createNew: boolean = (startExpMode === 'new'); const createNew: boolean = (startExpMode === 'new');
...@@ -52,6 +52,8 @@ async function initContainer(platformMode: string): Promise<void> { ...@@ -52,6 +52,8 @@ async function initContainer(platformMode: string): Promise<void> {
Container.bind(TrainingService).to(RemoteMachineTrainingService).scope(Scope.Singleton); Container.bind(TrainingService).to(RemoteMachineTrainingService).scope(Scope.Singleton);
} else if (platformMode === 'pai') { } else if (platformMode === 'pai') {
Container.bind(TrainingService).to(PAITrainingService).scope(Scope.Singleton); Container.bind(TrainingService).to(PAITrainingService).scope(Scope.Singleton);
} else if (platformMode === 'kubeflow') {
Container.bind(TrainingService).to(KubeflowTrainingService).scope(Scope.Singleton);
} else { } else {
throw new Error(`Error: unsupported mode: ${mode}`); throw new Error(`Error: unsupported mode: ${mode}`);
} }
...@@ -76,19 +78,22 @@ if (!strPort || strPort.length === 0) { ...@@ -76,19 +78,22 @@ if (!strPort || strPort.length === 0) {
const port: number = parseInt(strPort, 10); const port: number = parseInt(strPort, 10);
const mode: string = parseArg(['--mode', '-m']); const mode: string = parseArg(['--mode', '-m']);
if (!['local', 'remote', 'pai'].includes(mode)) { if (!['local', 'remote', 'pai', 'kubeflow'].includes(mode)) {
console.log(`FATAL: unknown mode: ${mode}`);
usage(); usage();
process.exit(1); process.exit(1);
} }
const startMode: string = parseArg(['--start_mode', '-s']); const startMode: string = parseArg(['--start_mode', '-s']);
if (!['new', 'resume'].includes(startMode)) { if (!['new', 'resume'].includes(startMode)) {
console.log(`FATAL: unknown start_mode: ${startMode}`);
usage(); usage();
process.exit(1); process.exit(1);
} }
const experimentId: string = parseArg(['--experiment_id', '-id']); const experimentId: string = parseArg(['--experiment_id', '-id']);
if (startMode === 'resume' && experimentId.trim().length < 1) { if (startMode === 'resume' && experimentId.trim().length < 1) {
console.log(`FATAL: cannot resume experiment, invalid experiment_id: ${experimentId}`);
usage(); usage();
process.exit(1); process.exit(1);
} }
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
"express": "^4.16.3", "express": "^4.16.3",
"express-joi-validator": "^2.0.0", "express-joi-validator": "^2.0.0",
"node-nvidia-smi": "^1.0.0", "node-nvidia-smi": "^1.0.0",
"node-yaml": "^3.1.1",
"rx": "^4.1.0", "rx": "^4.1.0",
"sqlite3": "^4.0.2", "sqlite3": "^4.0.2",
"ssh2": "^0.6.1", "ssh2": "^0.6.1",
......
...@@ -89,9 +89,7 @@ class NNIRestHandler { ...@@ -89,9 +89,7 @@ class NNIRestHandler {
return router; return router;
} }
private handle_error(err: Error, res: Response): void { private handle_error(err: Error, res: Response, isFatal: boolean = false): void {
this.log.info(err);
if (err instanceof NNIError && err.name === NNIErrorNames.NOT_FOUND) { if (err instanceof NNIError && err.name === NNIErrorNames.NOT_FOUND) {
res.status(404); res.status(404);
} else { } else {
...@@ -100,6 +98,14 @@ class NNIRestHandler { ...@@ -100,6 +98,14 @@ class NNIRestHandler {
res.send({ res.send({
error: err.message error: err.message
}); });
// If it's a fatal error, exit process
if(isFatal) {
this.log.critical(err);
process.exit(1);
}
this.log.error(err);
} }
// TODO add validators for request params, query, body // TODO add validators for request params, query, body
...@@ -145,12 +151,14 @@ class NNIRestHandler { ...@@ -145,12 +151,14 @@ class NNIRestHandler {
experiment_id: eid experiment_id: eid
}); });
}).catch((err: Error) => { }).catch((err: Error) => {
// Start experiment is a step of initialization, so any exception thrown is a fatal
this.handle_error(err, res); this.handle_error(err, res);
}); });
} else { } else {
this.nniManager.resumeExperiment().then(() => { this.nniManager.resumeExperiment().then(() => {
res.send(); res.send();
}).catch((err: Error) => { }).catch((err: Error) => {
// Resume experiment is a step of initialization, so any exception thrown is a fatal
this.handle_error(err, res); this.handle_error(err, res);
}); });
} }
...@@ -180,7 +188,8 @@ class NNIRestHandler { ...@@ -180,7 +188,8 @@ class NNIRestHandler {
} }
res.send(); res.send();
} catch (err) { } catch (err) {
this.handle_error(err, res); // setClusterMetata is a step of initialization, so any exception thrown is a fatal
this.handle_error(err, res, true);
} }
}); });
} }
......
...@@ -46,6 +46,14 @@ export namespace ValidationSchemas { ...@@ -46,6 +46,14 @@ export namespace ValidationSchemas {
userName: joi.string().min(1).required(), userName: joi.string().min(1).required(),
passWord: joi.string().min(1).required(), passWord: joi.string().min(1).required(),
host: joi.string().min(1).required() host: joi.string().min(1).required()
}),
kubeflow_config: joi.object({
operator: joi.string().min(1).required(),
nfs: joi.object({
server: joi.string().min(1).required(),
path: joi.string().min(1).required()
}).required(),
kubernetesServer: joi.string().min(1).required()
}) })
} }
}; };
......
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as assert from 'assert';
import { Request, Response, Router } from 'express';
import * as bodyParser from 'body-parser';
import * as component from '../../common/component';
import { getBasePort, getExperimentId } from '../../common/experimentStartupInfo';
import { RestServer } from '../../common/restServer'
/**
* Cluster Job Training service Rest server, provides rest API to support Cluster job metrics update
*
*/
@component.Singleton
export abstract class ClusterJobRestServer extends RestServer{
private readonly API_ROOT_URL: string = '/api/v1/nni-pai';
private readonly expId: string = getExperimentId();
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
constructor() {
super();
const basePort: number = getBasePort();
assert(basePort && basePort > 1024);
this.port = basePort + 1;
}
public get clusterRestServerPort(): number {
if(!this.port) {
throw new Error('PAI Rest server port is undefined');
}
return this.port;
}
/**
* NNIRestServer's own router registration
*/
protected registerRestHandler(): void {
this.app.use(bodyParser.json());
this.app.use(this.API_ROOT_URL, this.createRestHandler());
}
private createRestHandler() : Router {
const router: Router = Router();
// tslint:disable-next-line:typedef
router.use((req: Request, res: Response, next) => {
this.log.info(`${req.method}: ${req.url}: body:\n${JSON.stringify(req.body, undefined, 4)}`);
res.setHeader('Content-Type', 'application/json');
next();
});
router.post(`/update-metrics/${this.expId}/:trialId`, (req: Request, res: Response) => {
try {
this.log.info(`Get update-metrics request, trial job id is ${req.params.trialId}`);
this.log.info(`update-metrics body is ${JSON.stringify(req.body)}`);
this.handleTrialMetrics(req.body.jobId, req.body.metrics);
res.send();
}
catch(err) {
this.log.error(`json parse metrics error: ${err}`);
res.status(500);
res.send(err.message);
}
});
return router;
}
/** Abstract method to handle trial metrics data */
protected abstract handleTrialMetrics(jobId : string, trialMetrics : any[]) : void;
}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
export const CONTAINER_INSTALL_NNI_SHELL_FORMAT: string =
`#!/bin/bash
if python3 -c 'import nni' > /dev/null 2>&1; then
# nni module is already installed, skip
return
else
# Install nni
python3 -m pip install --user --upgrade nni
fi`;
\ No newline at end of file
...@@ -28,5 +28,6 @@ export enum TrialConfigMetadataKey { ...@@ -28,5 +28,6 @@ export enum TrialConfigMetadataKey {
EXPERIMENT_ID = 'experimentId', EXPERIMENT_ID = 'experimentId',
MULTI_PHASE = 'multiPhase', MULTI_PHASE = 'multiPhase',
RANDOM_SCHEDULER = 'random_scheduler', RANDOM_SCHEDULER = 'random_scheduler',
PAI_CLUSTER_CONFIG = 'pai_config' PAI_CLUSTER_CONFIG = 'pai_config',
KUBEFLOW_CLUSTER_CONFIG = 'kubeflow_config'
} }
import { TrialConfig } from "../common/trialConfig";
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
/** operator types that kubeflow supported */
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' | 'mxnet-operator' | 'caffe2-operator' | 'chainer-operator' | 'mpi-operator';
export type KubeflowOperatorPlural = 'tfjobs' | 'pytorchjobs' | 'mxjobs' | 'caffe2jobs' | 'chainerjobs' | 'mpijobs';
/**
* map from Kubeflow operator name to its plural name in K8S
*/
export const kubeflowOperatorMap : Map<KubeflowOperator, KubeflowOperatorPlural> = new Map<KubeflowOperator, KubeflowOperatorPlural>([
['tf-operator' , 'tfjobs'],
['pytorch-operator', 'pytorchjobs'],
['mxnet-operator', 'mxjobs'],
['caffe2-operator', 'caffe2jobs'],
['chainer-operator', 'chainerjobs'],
['mpi-operator', 'mpijobs']
]);
/**
* Kuberflow cluster configuration
*
*/
export class KubeflowClusterConfig {
/** Name of Kubeflow operator, like tf-operator */
public readonly operator: KubeflowOperator;
public readonly nfs: NFSConfig;
public readonly kubernetesServer: string;
/**
* Constructor
* @param userName User name of Kubeflow Cluster
* @param passWord password of Kubeflow Cluster
* @param host Host IP of Kubeflow Cluster
*/
constructor(operator: KubeflowOperator, nfs : NFSConfig, kubernetesServer : string) {
this.operator = operator;
this.nfs = nfs;
this.kubernetesServer = kubernetesServer;
}
}
/**
* NFS configuration to store Kubeflow job related files
*/
export class NFSConfig {
/** IP Adress of NFS server */
public readonly server : string;
/** exported NFS path on NFS server */
public readonly path : string;
constructor(server : string, path : string) {
this.server = server;
this.path = path;
}
}
/**
* Trial job configuration for Kubeflow
*/
export class KubeflowTrialConfig extends TrialConfig {
public readonly cpuNum: number;
public readonly memoryMB: number;
public readonly image: string;
constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, image: string) {
super(command, codeDir, gpuNum);
this.cpuNum = cpuNum;
this.memoryMB = memoryMB;
this.image = image;
}
}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
/**
* KubeflowTrialJobDetail
*/
// tslint:disable-next-line:max-classes-per-file
export class KubeflowTrialJobDetail implements TrialJobDetail {
public id: string;
public status: TrialJobStatus;
public submitTime: number;
public startTime?: number;
public endTime?: number;
public tags?: string[];
public url?: string;
public workingDirectory: string;
public form: JobApplicationForm;
public kubeflowJobName: string;
public sequenceId: number;
public queryJobFailedCount: number;
public k8sPluralName: string
constructor(id: string, status: TrialJobStatus, submitTime: number,
workingDirectory: string, form: JobApplicationForm,
kubeflowJobName: string, sequenceId: number, url: string, k8sPluralName: string) {
this.id = id;
this.status = status;
this.submitTime = submitTime;
this.workingDirectory = workingDirectory;
this.form = form;
this.kubeflowJobName = kubeflowJobName;
this.sequenceId = sequenceId;
this.tags = [];
this.queryJobFailedCount = 0;
this.url = url;
this.k8sPluralName = k8sPluralName;
}
}
export const KUBEFLOW_RUN_SHELL_FORMAT: string =
`#!/bin/bash
export NNI_PLATFORM=kubeflow
export NNI_SYS_DIR={0}
export NNI_OUTPUT_DIR={1}
export MULTI_PHASE=false
export NNI_TRIAL_JOB_ID={2}
export NNI_EXP_ID={3}
export NNI_CODE_DIR={4}
export NNI_TRIAL_SEQ_ID={5}
mkdir -p $NNI_SYS_DIR
mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh # Check and install NNI pkg
python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR//trialkeeper_stderr
`
export type KubeflowTFJobType = 'Created' | 'Running' | 'Failed' | 'Succeeded';
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as cpp from 'child-process-promise';
import { getLogger, Logger } from '../../common/log';
import { KubeflowTrialJobDetail, KubeflowTFJobType} from './kubeflowData';
import { NNIError, NNIErrorNames } from '../../common/errors';
import { TrialJobStatus } from '../../common/trainingService';
/**
* Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally
*/
export class KubeflowJobInfoCollector {
private readonly trialJobsMap : Map<string, KubeflowTrialJobDetail>;
private readonly log: Logger = getLogger();
private readonly statusesNeedToCheck: TrialJobStatus[];
private readonly MAX_FAILED_QUERY_JOB_NUMBER: number = 30;
constructor(jobMap: Map<string, KubeflowTrialJobDetail>) {
this.trialJobsMap = jobMap;
this.statusesNeedToCheck = ['RUNNING', 'WAITING'];
}
public async retrieveTrialStatus() : Promise<void> {
const updateKubeflowTrialJobs : Promise<void>[] = [];
for(let [trialJobId, kubeflowTrialJob] of this.trialJobsMap) {
if (!kubeflowTrialJob) {
throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`);
}
// Since Kubeflow needs some delay to schedule jobs, we provide 20 seconds buffer time to check kubeflow job's status
if( Date.now() - kubeflowTrialJob.submitTime < 20 * 1000) {
return Promise.resolve();
}
updateKubeflowTrialJobs.push(this.retrieveSingleTrialJobInfo(kubeflowTrialJob))
}
await Promise.all(updateKubeflowTrialJobs);
}
private async retrieveSingleTrialJobInfo(kubeflowTrialJob : KubeflowTrialJobDetail) : Promise<void> {
if (!this.statusesNeedToCheck.includes(kubeflowTrialJob.status)) {
return Promise.resolve();
}
let result : cpp.childProcessPromise.Result;
try {
result = await cpp.exec(`kubectl get ${kubeflowTrialJob.k8sPluralName} ${kubeflowTrialJob.kubeflowJobName} -o json`);
if(result.stderr) {
this.log.error(`Get ${kubeflowTrialJob.k8sPluralName} ${kubeflowTrialJob.kubeflowJobName} failed. Error is ${result.stderr}, failed checking number is ${kubeflowTrialJob.queryJobFailedCount}`);
kubeflowTrialJob.queryJobFailedCount++;
if(kubeflowTrialJob.queryJobFailedCount >= this.MAX_FAILED_QUERY_JOB_NUMBER) {
kubeflowTrialJob.status = 'UNKNOWN';
}
}
} catch(error) {
this.log.error(`kubectl get ${kubeflowTrialJob.k8sPluralName} ${kubeflowTrialJob.kubeflowJobName} failed, error is ${error}`);
return Promise.resolve();
}
const kubeflowJobInfo = JSON.parse(result.stdout);
if(kubeflowJobInfo.status && kubeflowJobInfo.status.conditions) {
const latestCondition = kubeflowJobInfo.status.conditions[kubeflowJobInfo.status.conditions.length - 1];
const tfJobType : KubeflowTFJobType = <KubeflowTFJobType>latestCondition.type;
switch(tfJobType) {
case 'Created':
kubeflowTrialJob.status = 'WAITING';
kubeflowTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
break;
case 'Running':
kubeflowTrialJob.status = 'RUNNING';
if(!kubeflowTrialJob.startTime) {
kubeflowTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
}
break;
case 'Failed':
kubeflowTrialJob.status = 'FAILED';
kubeflowTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
break;
case 'Succeeded':
kubeflowTrialJob.status = 'SUCCEEDED';
kubeflowTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
break;
default:
break;
}
}
return Promise.resolve();
}
}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as component from '../../common/component';
import { Inject } from 'typescript-ioc';
import { KubeflowTrainingService } from './kubeflowTrainingService';
import { ClusterJobRestServer } from '../common/clusterJobRestServer'
/**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
*
*/
@component.Singleton
export class KubeflowJobRestServer extends ClusterJobRestServer{
@Inject
private readonly kubeflowTrainingService : KubeflowTrainingService;
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
constructor() {
super();
this.kubeflowTrainingService = component.get(KubeflowTrainingService);
}
protected handleTrialMetrics(jobId : string, metrics : any[]) : void {
// Split metrics array into single metric, then emit
// Warning: If not split metrics into single ones, the behavior will be UNKNOWN
for (const singleMetric of metrics) {
this.kubeflowTrainingService.MetricsEmitter.emit('metric', {
id : jobId,
data : singleMetric
});
}
}
}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict'
import * as assert from 'assert';
import * as component from '../../common/component';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import * as path from 'path';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { EventEmitter } from 'events';
import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import { MethodNotImplementedError } from '../../common/errors';
import { String } from 'typescript-string-operations';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import {
JobApplicationForm, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric
} from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils';
import { KubeflowClusterConfig, kubeflowOperatorMap, KubeflowTrialConfig, NFSConfig } from './kubeflowConfig';
import { KubeflowTrialJobDetail, KUBEFLOW_RUN_SHELL_FORMAT } from './kubeflowData';
import { KubeflowJobRestServer } from './kubeflowJobRestServer';
import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
var yaml = require('node-yaml');
/**
* Training Service implementation for Kubeflow
* Refer https://github.com/kubeflow/kubeflow for more info about Kubeflow
*/
@component.Singleton
class KubeflowTrainingService implements TrainingService {
private readonly NNI_KUBEFLOW_TRIAL_LABEL = 'nni-kubeflow-trial';
private readonly log!: Logger;
private readonly metricsEmitter: EventEmitter;
private readonly trialJobsMap: Map<string, KubeflowTrialJobDetail>;
/** experiment root dir in NFS */
private readonly trialLocalNFSTempFolder: string;
private stopping: boolean = false;
private experimentId! : string;
private nextTrialSequenceId: number;
private kubeflowClusterConfig?: KubeflowClusterConfig;
private kubeflowTrialConfig?: KubeflowTrialConfig;
private kubeflowJobInfoCollector: KubeflowJobInfoCollector;
private kubeflowRestServerPort?: number;
private kubeflowJobPlural?: string;
private readonly CONTAINER_MOUNT_PATH: string;
constructor() {
this.log = getLogger();
this.metricsEmitter = new EventEmitter();
this.trialJobsMap = new Map<string, KubeflowTrialJobDetail>();
this.kubeflowJobInfoCollector = new KubeflowJobInfoCollector(this.trialJobsMap);
this.trialLocalNFSTempFolder = path.join(getExperimentRootDir(), 'trials-nfs-tmp');
this.experimentId = getExperimentId();
this.nextTrialSequenceId = -1;
this.CONTAINER_MOUNT_PATH = '/tmp/nfs';
}
public async run(): Promise<void> {
const restServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer);
await restServer.start();
this.log.info(`Kubeflow Training service rest server listening on: ${restServer.endPoint}`);
while (!this.stopping) {
// collect metrics by calling 'kubectl get' command on Kubeflow jobs
await delay(3000);
await this.kubeflowJobInfoCollector.retrieveTrialStatus();
}
}
public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> {
if(!this.kubeflowClusterConfig) {
throw new Error('Kubeflow Cluster config is not initialized');
}
if(!this.kubeflowTrialConfig) {
throw new Error('Kubeflow trial config is not initialized');
}
if(!this.kubeflowJobPlural) {
throw new Error('Kubeflow job plural name is undefined');
}
if(!this.kubeflowRestServerPort) {
const restServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer);
this.kubeflowRestServerPort = restServer.clusterRestServerPort;
}
const trialJobId: string = uniqueString(5);
const curTrialSequenceId: number = this.generateSequenceId();
// Set trial's NFS working folder
const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId);
const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${this.kubeflowTrialConfig.codeDir} ${trialLocalTempFolder}`);
const runScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' });
const kubeflowRunScriptContent: string = String.Format(
KUBEFLOW_RUN_SHELL_FORMAT,
`$PWD/nni/${trialJobId}`,
path.join(trialWorkingFolder, 'output'),
trialJobId,
getExperimentId(),
trialWorkingFolder,
curTrialSequenceId,
this.kubeflowTrialConfig.command,
getIPV4Address(),
this.kubeflowRestServerPort
);
//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);
// Write file content ( run.sh and parameter.cfg ) to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run.sh'), kubeflowRunScriptContent, { encoding: 'utf8' });
// Write file content ( parameter.cfg ) to local tmp folders
const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form)
if(trialForm && trialForm.hyperParameters) {
await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)),
trialForm.hyperParameters.value, { encoding: 'utf8' });
}
const kubeflowJobYamlPath = path.join(trialLocalTempFolder, `kubeflow-job-${trialJobId}.yaml`);
const kubeflowJobName = `nni-exp-${this.experimentId}-trial-${trialJobId}`.toLowerCase();
const podResources : any = {};
podResources.requests = {
'memory': `${this.kubeflowTrialConfig.memoryMB}Mi`,
'cpu': `${this.kubeflowTrialConfig.cpuNum}`,
'nvidia.com/gpu': `${this.kubeflowTrialConfig.gpuNum}`
}
podResources.limits = Object.assign({}, podResources.requests);
// Generate kubeflow job resource yaml file for K8S
yaml.write(
kubeflowJobYamlPath,
this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, podResources),
'utf-8'
);
// Creat work dir for current trial in NFS directory
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`);
// Copy code files from local dir to NFS mounted dir
await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
const nfsConfig: NFSConfig = this.kubeflowClusterConfig.nfs;
const trialJobDetail: KubeflowTrialJobDetail = new KubeflowTrialJobDetail(
trialJobId,
'WAITING',
Date.now(),
trialWorkingFolder,
form,
kubeflowJobName,
curTrialSequenceId,
`nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`,
this.kubeflowJobPlural
);
// Create kubeflow training jobs
await cpp.exec(`kubectl create -f ${kubeflowJobYamlPath}`);
// Set trial job detail until kubectl create resource successfully
this.trialJobsMap.set(trialJobId, trialJobDetail);
return Promise.resolve(trialJobDetail);
}
public updateTrialJob(trialJobId: string, form: JobApplicationForm): Promise<TrialJobDetail> {
throw new MethodNotImplementedError();
}
public listTrialJobs(): Promise<TrialJobDetail[]> {
throw new MethodNotImplementedError();
}
public getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
if(!this.kubeflowClusterConfig) {
throw new Error('Kubeflow Cluster config is not initialized');
}
const kubeflowTrialJob: TrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
if (!kubeflowTrialJob) {
return Promise.reject(`trial job ${trialJobId} not found`)
}
return Promise.resolve(kubeflowTrialJob);
}
public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void) {
this.metricsEmitter.on('metric', listener);
}
public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void) {
this.metricsEmitter.off('metric', listener);
}
public get isMultiPhaseJobSupported(): boolean {
return false;
}
public async cancelTrialJob(trialJobId: string): Promise<void> {
const trialJobDetail : KubeflowTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
if(!trialJobDetail) {
const errorMessage: string = `CancelTrialJob: trial job id ${trialJobId} not found`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}
if(!this.kubeflowJobPlural) {
const errorMessage: string = `CancelTrialJob: trial job id ${trialJobId} failed because kubeflowJobPlural is undefined`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}
const result: cpp.childProcessPromise.Result = await cpp.exec(`kubectl delete ${this.kubeflowJobPlural} -l app=${this.NNI_KUBEFLOW_TRIAL_LABEL},expId=${getExperimentId()},trialId=${trialJobId}`);
if(result.stderr) {
const errorMessage: string = `kubectl delete ${this.kubeflowJobPlural} for trial ${trialJobId} failed: ${result.stderr}`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}
trialJobDetail.endTime = Date.now();
trialJobDetail.status = 'USER_CANCELED';
return Promise.resolve();
}
public async setClusterMetadata(key: string, value: string): Promise<void> {
switch (key) {
case TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG:
this.kubeflowClusterConfig = <KubeflowClusterConfig>JSON.parse(value);
// If NFS config section is valid in config file, proceed to mount and config NFS
if(this.kubeflowClusterConfig.nfs) {
//Check and mount NFS mount point here
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}`);
const nfsServer: string = this.kubeflowClusterConfig.nfs.server;
const nfsPath: string = this.kubeflowClusterConfig.nfs.path;
try {
await cpp.exec(`sudo mount ${nfsServer}:${nfsPath} ${this.trialLocalNFSTempFolder}`);
} catch(error) {
const mountError: string = `Mount NFS ${nfsServer}:${nfsPath} to ${this.trialLocalNFSTempFolder} failed, error is ${error}`;
this.log.error(mountError);
throw new Error(mountError);
}
}
this.kubeflowJobPlural = kubeflowOperatorMap.get(this.kubeflowClusterConfig.operator);
break;
case TrialConfigMetadataKey.TRIAL_CONFIG:
if (!this.kubeflowClusterConfig){
this.log.error('kubeflow cluster config is not initialized');
return Promise.reject(new Error('kubeflow cluster config is not initialized'));
}
this.kubeflowTrialConfig = <KubeflowTrialConfig>JSON.parse(value);
break;
default:
break;
}
return Promise.resolve();
}
public getClusterMetadata(key: string): Promise<string> {
throw new MethodNotImplementedError();
}
public async cleanUp(): Promise<void> {
this.stopping = true;
// First, cancel all running kubeflow jobs
for(let [trialJobId, kubeflowTrialJob] of this.trialJobsMap) {
if(['RUNNING', 'WAITING', 'UNKNOWN'].includes(kubeflowTrialJob.status)) {
try {
await this.cancelTrialJob(trialJobId);
} catch(error) {} // DONT throw error during cleanup
kubeflowTrialJob.status = 'SYS_CANCELED';
}
}
assert(this.kubeflowJobPlural !== undefined);
// Delete all kubeflow jobs whose expId label is current experiment id
try {
await cpp.exec(`kubectl delete ${this.kubeflowJobPlural} -l app=${this.NNI_KUBEFLOW_TRIAL_LABEL},expId=${getExperimentId()}`);
} catch(error) {
this.log.error(`Delete ${this.kubeflowJobPlural} with label: app=${this.NNI_KUBEFLOW_TRIAL_LABEL},expId=${getExperimentId()} failed, error is ${error}`);
}
// Unmount NFS
try {
await cpp.exec(`sudo umount ${this.trialLocalNFSTempFolder}`);
} catch(error) {
this.log.error(`Unmount ${this.trialLocalNFSTempFolder} failed, error is ${error}`);
}
// Stop Kubeflow rest server
const restServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer);
try {
await restServer.stop();
this.log.info('Kubeflow Training service rest server stopped successfully.');
} catch (error) {
this.log.error(`Kubeflow Training service rest server stopped failed, error: ${error.message}`);
Promise.reject(error);
}
return Promise.resolve();
}
public get MetricsEmitter() : EventEmitter {
return this.metricsEmitter;
}
private generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName : string, podResources : any) : any {
if(!this.kubeflowClusterConfig) {
throw new Error('Kubeflow Cluster config is not initialized');
}
if(!this.kubeflowTrialConfig) {
throw new Error('Kubeflow trial config is not initialized');
}
return {
apiVersion: 'kubeflow.org/v1alpha2',
kind: 'TFJob',
metadata: {
name: kubeflowJobName,
namespace: 'default',
labels: {
app: this.NNI_KUBEFLOW_TRIAL_LABEL,
expId: getExperimentId(),
trialId: trialJobId
}
},
spec: {
tfReplicaSpecs: {
Worker: {
replicas: 1,
template: {
metadata: {
creationTimestamp: null
},
spec: {
containers: [
{
// Kubeflow tensorflow operator requires that containers' name must be tensorflow
// TODO: change the name based on operator's type
name: 'tensorflow',
image: this.kubeflowTrialConfig.image,
args: ["sh", `${path.join(trialWorkingFolder, 'run.sh')}`],
volumeMounts: [{
name: 'nni-nfs-vol',
mountPath: this.CONTAINER_MOUNT_PATH
}],
resources: podResources//,
//workingDir: '/tmp/nni/nuDEP'
}],
restartPolicy: 'ExitCode',
volumes: [{
name: 'nni-nfs-vol',
nfs: {
server: `${this.kubeflowClusterConfig.nfs.server}`,
path: `${this.kubeflowClusterConfig.nfs.path}`
}
}]
}
}
}
}
}
};
}
private generateSequenceId(): number {
if (this.nextTrialSequenceId === -1) {
this.nextTrialSequenceId = getInitTrialSequenceId();
}
return this.nextTrialSequenceId++;
}
}
export { KubeflowTrainingService }
\ No newline at end of file
...@@ -320,6 +320,7 @@ class LocalTrainingService implements TrainingService { ...@@ -320,6 +320,7 @@ class LocalTrainingService implements TrainingService {
{ key: 'NNI_SYS_DIR', value: trialJobDetail.workingDirectory }, { key: 'NNI_SYS_DIR', value: trialJobDetail.workingDirectory },
{ key: 'NNI_TRIAL_JOB_ID', value: trialJobDetail.id }, { key: 'NNI_TRIAL_JOB_ID', value: trialJobDetail.id },
{ key: 'NNI_OUTPUT_DIR', value: trialJobDetail.workingDirectory }, { key: 'NNI_OUTPUT_DIR', value: trialJobDetail.workingDirectory },
{ key: 'NNI_TRIAL_SEQ_ID', value: trialJobDetail.sequenceId.toString() },
{ key: 'MULTI_PHASE', value: this.isMultiPhase.toString() } { key: 'MULTI_PHASE', value: this.isMultiPhase.toString() }
]; ];
} }
...@@ -368,7 +369,6 @@ class LocalTrainingService implements TrainingService { ...@@ -368,7 +369,6 @@ class LocalTrainingService implements TrainingService {
await cpp.exec(`touch ${path.join(trialJobDetail.workingDirectory, '.nni', 'metrics')}`); await cpp.exec(`touch ${path.join(trialJobDetail.workingDirectory, '.nni', 'metrics')}`);
await fs.promises.writeFile(path.join(trialJobDetail.workingDirectory, 'run.sh'), runScriptLines.join('\n'), { encoding: 'utf8', mode: 0o777 }); await fs.promises.writeFile(path.join(trialJobDetail.workingDirectory, 'run.sh'), runScriptLines.join('\n'), { encoding: 'utf8', mode: 0o777 });
await this.writeParameterFile(trialJobDetail.workingDirectory, (<TrialJobApplicationForm>trialJobDetail.form).hyperParameters); await this.writeParameterFile(trialJobDetail.workingDirectory, (<TrialJobApplicationForm>trialJobDetail.form).hyperParameters);
await this.writeSequenceIdFile(trialJobId);
const process: cp.ChildProcess = cp.exec(`bash ${path.join(trialJobDetail.workingDirectory, 'run.sh')}`); const process: cp.ChildProcess = cp.exec(`bash ${path.join(trialJobDetail.workingDirectory, 'run.sh')}`);
this.setTrialJobStatus(trialJobDetail, 'RUNNING'); this.setTrialJobStatus(trialJobDetail, 'RUNNING');
...@@ -450,13 +450,6 @@ class LocalTrainingService implements TrainingService { ...@@ -450,13 +450,6 @@ class LocalTrainingService implements TrainingService {
return this.trialSequenceId++; return this.trialSequenceId++;
} }
private async writeSequenceIdFile(trialJobId: string): Promise<void> {
const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId);
assert(trialJobDetail !== undefined);
const filepath: string = path.join(trialJobDetail.workingDirectory, '.nni', 'sequence_id');
await fs.promises.writeFile(filepath, trialJobDetail.sequenceId.toString(), { encoding: 'utf8' });
}
} }
export { LocalTrainingService }; export { LocalTrainingService };
...@@ -60,10 +60,10 @@ else ...@@ -60,10 +60,10 @@ else
fi`; fi`;
export const PAI_TRIAL_COMMAND_FORMAT: string = export const PAI_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} `export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4}
&& cd $NNI_SYS_DIR && sh install_nni.sh && cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{4}' --nnimanager_ip '{5}' --nnimanager_port '{6}' && python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
--pai_hdfs_output_dir '{7}' --pai_hdfs_host '{8}' --pai_user_name {9}`; --pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10}`;
export const PAI_OUTPUT_DIR_FORMAT: string = export const PAI_OUTPUT_DIR_FORMAT: string =
`hdfs://{0}:9000/`; `hdfs://{0}:9000/`;
......
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
'use strict'; 'use strict';
import * as request from 'request'; import * as request from 'request';
import { EventEmitter } from 'events';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { NNIError, NNIErrorNames } from '../../common/errors'; import { NNIError, NNIErrorNames } from '../../common/errors';
...@@ -43,7 +42,7 @@ export class PAIJobInfoCollector { ...@@ -43,7 +42,7 @@ export class PAIJobInfoCollector {
this.finalStatuses = ['SUCCEEDED', 'FAILED', 'USER_CANCELED', 'SYS_CANCELED']; this.finalStatuses = ['SUCCEEDED', 'FAILED', 'USER_CANCELED', 'SYS_CANCELED'];
} }
public async updateTrialStatusFromPAI(paiToken? : string, paiClusterConfig?: PAIClusterConfig) : Promise<void> { public async retrieveTrialStatus(paiToken? : string, paiClusterConfig?: PAIClusterConfig) : Promise<void> {
if (!paiClusterConfig || !paiToken) { if (!paiClusterConfig || !paiToken) {
return Promise.resolve(); return Promise.resolve();
} }
......
...@@ -19,29 +19,17 @@ ...@@ -19,29 +19,17 @@
'use strict'; 'use strict';
import * as assert from 'assert';
import { Request, Response, Router } from 'express';
import * as bodyParser from 'body-parser';
import * as component from '../../common/component'; import * as component from '../../common/component';
import { getBasePort } from '../../common/experimentStartupInfo';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { Inject } from 'typescript-ioc'; import { Inject } from 'typescript-ioc';
import { PAITrainingService } from './paiTrainingService'; import { PAITrainingService } from './paiTrainingService';
import { RestServer } from '../../common/restServer' import { ClusterJobRestServer } from '../common/clusterJobRestServer'
/** /**
* PAI Training service Rest server, provides rest API to support pai job metrics update * PAI Training service Rest server, provides rest API to support pai job metrics update
* *
*/ */
@component.Singleton @component.Singleton
export class PAIJobRestServer extends RestServer{ export class PAIJobRestServer extends ClusterJobRestServer{
/** NNI main rest service default port */
private static readonly DEFAULT_PORT: number = 51189;
private readonly API_ROOT_URL: string = '/api/v1/nni-pai';
private readonly expId: string = getExperimentId();
@Inject @Inject
private readonly paiTrainingService : PAITrainingService; private readonly paiTrainingService : PAITrainingService;
...@@ -50,61 +38,17 @@ export class PAIJobRestServer extends RestServer{ ...@@ -50,61 +38,17 @@ export class PAIJobRestServer extends RestServer{
*/ */
constructor() { constructor() {
super(); super();
const basePort: number = getBasePort();
assert(basePort && basePort > 1024);
this.port = basePort + 1; // PAIJobRestServer.DEFAULT_PORT;
this.paiTrainingService = component.get(PAITrainingService); this.paiTrainingService = component.get(PAITrainingService);
} }
public get paiRestServerPort(): number { protected handleTrialMetrics(jobId : string, metrics : any[]) : void {
if(!this.port) { // Split metrics array into single metric, then emit
throw new Error('PAI Rest server port is undefined'); // Warning: If not split metrics into single ones, the behavior will be UNKNOWN
for (const singleMetric of metrics) {
this.paiTrainingService.MetricsEmitter.emit('metric', {
id : jobId,
data : singleMetric
});
} }
return this.port;
}
/**
* NNIRestServer's own router registration
*/
protected registerRestHandler(): void {
this.app.use(bodyParser.json());
this.app.use(this.API_ROOT_URL, this.createRestHandler());
}
private createRestHandler() : Router {
const router: Router = Router();
// tslint:disable-next-line:typedef
router.use((req: Request, res: Response, next) => {
this.log.info(`${req.method}: ${req.url}: body:\n${JSON.stringify(req.body, undefined, 4)}`);
res.setHeader('Content-Type', 'application/json');
next();
});
router.post(`/update-metrics/${this.expId}/:trialId`, (req: Request, res: Response) => {
try {
this.log.info(`Get update-metrics request, trial job id is ${req.params.trialId}`);
this.log.info(`update-metrics body is ${JSON.stringify(req.body)}`);
// Split metrics array into single metric, then emit
// Warning: If not split metrics into single ones, the behavior will be UNKNOWN
for (const singleMetric of req.body.metrics) {
this.paiTrainingService.MetricsEmitter.emit('metric', {
id : req.body.jobId,
data : singleMetric
});
}
res.send();
}
catch(err) {
this.log.error(`json parse metrics error: ${err}`);
res.status(500);
res.send(err.message);
}
});
return router;
} }
} }
\ No newline at end of file
...@@ -20,13 +20,13 @@ ...@@ -20,13 +20,13 @@
'use strict' 'use strict'
import * as assert from 'assert';
import * as component from '../../common/component'; import * as component from '../../common/component';
import * as cpp from 'child-process-promise'; import * as cpp from 'child-process-promise';
import * as fs from 'fs'; import * as fs from 'fs';
import * as path from 'path'; import * as path from 'path';
import * as request from 'request'; import * as request from 'request';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import { EventEmitter } from 'events'; import { EventEmitter } from 'events';
import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo'; import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo';
...@@ -40,7 +40,7 @@ import { ...@@ -40,7 +40,7 @@ import {
} from '../../common/trainingService'; } from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils'; import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils';
import { PAIJobRestServer } from './paiJobRestServer' import { PAIJobRestServer } from './paiJobRestServer'
import { PAITrialJobDetail, PAI_INSTALL_NNI_SHELL_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData'; import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData';
import { PAIJobInfoCollector } from './paiJobInfoCollector'; import { PAIJobInfoCollector } from './paiJobInfoCollector';
import { String } from 'typescript-string-operations'; import { String } from 'typescript-string-operations';
import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig'; import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig';
...@@ -67,7 +67,7 @@ class PAITrainingService implements TrainingService { ...@@ -67,7 +67,7 @@ class PAITrainingService implements TrainingService {
private readonly hdfsDirPattern: string; private readonly hdfsDirPattern: string;
private hdfsBaseDir: string | undefined; private hdfsBaseDir: string | undefined;
private hdfsOutputHost: string | undefined; private hdfsOutputHost: string | undefined;
private trialSequenceId: number; private nextTrialSequenceId: number;
private paiRestServerPort?: number; private paiRestServerPort?: number;
constructor() { constructor() {
...@@ -79,7 +79,7 @@ class PAITrainingService implements TrainingService { ...@@ -79,7 +79,7 @@ class PAITrainingService implements TrainingService {
this.experimentId = getExperimentId(); this.experimentId = getExperimentId();
this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap); this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap);
this.hdfsDirPattern = 'hdfs://(?<host>([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?<baseDir>/.*)?'; this.hdfsDirPattern = 'hdfs://(?<host>([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?<baseDir>/.*)?';
this.trialSequenceId = -1; this.nextTrialSequenceId = -1;
} }
public async run(): Promise<void> { public async run(): Promise<void> {
...@@ -87,7 +87,7 @@ class PAITrainingService implements TrainingService { ...@@ -87,7 +87,7 @@ class PAITrainingService implements TrainingService {
await restServer.start(); await restServer.start();
this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`); this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`);
while (!this.stopping) { while (!this.stopping) {
await this.paiJobCollector.updateTrialStatusFromPAI(this.paiToken, this.paiClusterConfig); await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiClusterConfig);
await delay(3000); await delay(3000);
} }
} }
...@@ -148,7 +148,7 @@ class PAITrainingService implements TrainingService { ...@@ -148,7 +148,7 @@ class PAITrainingService implements TrainingService {
if(!this.paiRestServerPort) { if(!this.paiRestServerPort) {
const restServer: PAIJobRestServer = component.get(PAIJobRestServer); const restServer: PAIJobRestServer = component.get(PAIJobRestServer);
this.paiRestServerPort = restServer.paiRestServerPort; this.paiRestServerPort = restServer.clusterRestServerPort;
} }
this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`);
...@@ -162,9 +162,8 @@ class PAITrainingService implements TrainingService { ...@@ -162,9 +162,8 @@ class PAITrainingService implements TrainingService {
//create tmp trial working folder locally. //create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`); await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${this.paiTrialConfig.codeDir} ${trialLocalTempFolder}`); await cpp.exec(`cp -r ${this.paiTrialConfig.codeDir} ${trialLocalTempFolder}`);
await cpp.exec(`mkdir -p ${path.join(trialLocalTempFolder, '.nni')}`);
const runScriptContent : string = PAI_INSTALL_NNI_SHELL_FORMAT; const runScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files // Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' });
...@@ -173,7 +172,6 @@ class PAITrainingService implements TrainingService { ...@@ -173,7 +172,6 @@ class PAITrainingService implements TrainingService {
if(trialForm) { if(trialForm) {
await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)), await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)),
trialForm.hyperParameters.value, { encoding: 'utf8' }); trialForm.hyperParameters.value, { encoding: 'utf8' });
await fs.promises.writeFile(path.join(trialLocalTempFolder, '.nni', 'sequence_id'), trialSequenceId.toString(), { encoding: 'utf8' });
} }
// Step 1. Prepare PAI job configuration // Step 1. Prepare PAI job configuration
...@@ -204,6 +202,7 @@ class PAITrainingService implements TrainingService { ...@@ -204,6 +202,7 @@ class PAITrainingService implements TrainingService {
`$PWD/${trialJobId}/nnioutput`, `$PWD/${trialJobId}/nnioutput`,
trialJobId, trialJobId,
this.experimentId, this.experimentId,
trialSequenceId,
this.paiTrialConfig.command, this.paiTrialConfig.command,
getIPV4Address(), getIPV4Address(),
this.paiRestServerPort, this.paiRestServerPort,
...@@ -461,11 +460,11 @@ class PAITrainingService implements TrainingService { ...@@ -461,11 +460,11 @@ class PAITrainingService implements TrainingService {
} }
private generateSequenceId(): number { private generateSequenceId(): number {
if (this.trialSequenceId === -1) { if (this.nextTrialSequenceId === -1) {
this.trialSequenceId = getInitTrialSequenceId(); this.nextTrialSequenceId = getInitTrialSequenceId();
} }
return this.trialSequenceId++; return this.nextTrialSequenceId++;
} }
} }
......
...@@ -112,6 +112,7 @@ export const REMOTEMACHINE_RUN_SHELL_FORMAT: string = ...@@ -112,6 +112,7 @@ export const REMOTEMACHINE_RUN_SHELL_FORMAT: string =
`#!/bin/bash `#!/bin/bash
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0} export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0}
export MULTI_PHASE={7} export MULTI_PHASE={7}
export NNI_TRIAL_SEQ_ID={8}
cd $NNI_SYS_DIR cd $NNI_SYS_DIR
echo $$ >{2} echo $$ >{2}
eval {3}{4} 2>{5} eval {3}{4} 2>{5}
......
...@@ -442,6 +442,11 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -442,6 +442,11 @@ class RemoteMachineTrainingService implements TrainingService {
// for lint // for lint
return; return;
} }
const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
if (trialJobDetail === undefined) {
throw new Error(`Can not get trial job detail for job: ${trialJobId}`);
}
const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId); const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId);
await SSHClientUtility.remoteExeCommand(`mkdir -p ${trialWorkingFolder}`, sshClient); await SSHClientUtility.remoteExeCommand(`mkdir -p ${trialWorkingFolder}`, sshClient);
...@@ -463,7 +468,9 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -463,7 +468,9 @@ class RemoteMachineTrainingService implements TrainingService {
path.join(trialWorkingFolder, 'stderr'), path.join(trialWorkingFolder, 'stderr'),
path.join(trialWorkingFolder, '.nni', 'code'), path.join(trialWorkingFolder, '.nni', 'code'),
/** Mark if the trial is multi-phase job */ /** Mark if the trial is multi-phase job */
this.isMultiPhase); this.isMultiPhase,
trialJobDetail.sequenceId.toString()
);
//create tmp trial working folder locally. //create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.join(trialLocalTempFolder, '.nni')}`); await cpp.exec(`mkdir -p ${path.join(trialLocalTempFolder, '.nni')}`);
...@@ -475,7 +482,6 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -475,7 +482,6 @@ class RemoteMachineTrainingService implements TrainingService {
await SSHClientUtility.copyFileToRemote( await SSHClientUtility.copyFileToRemote(
path.join(trialLocalTempFolder, 'run.sh'), path.join(trialWorkingFolder, 'run.sh'), sshClient); path.join(trialLocalTempFolder, 'run.sh'), path.join(trialWorkingFolder, 'run.sh'), sshClient);
await this.writeParameterFile(trialJobId, form.hyperParameters, rmScheduleInfo.rmMeta); await this.writeParameterFile(trialJobId, form.hyperParameters, rmScheduleInfo.rmMeta);
await this.writeSequenceIdFile(trialJobId, rmScheduleInfo.rmMeta);
// Copy files in codeDir to remote working directory // Copy files in codeDir to remote working directory
await SSHClientUtility.copyDirectoryToRemote(this.trialConfig.codeDir, trialWorkingFolder, sshClient); await SSHClientUtility.copyDirectoryToRemote(this.trialConfig.codeDir, trialWorkingFolder, sshClient);
...@@ -614,15 +620,6 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -614,15 +620,6 @@ class RemoteMachineTrainingService implements TrainingService {
return this.trialSequenceId++; return this.trialSequenceId++;
} }
private async writeSequenceIdFile(trialJobId: string, rmMeta: RemoteMachineMeta): Promise<void> {
const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
if (trialJobDetail === undefined) {
assert(false, `Can not get trial job detail for job: ${trialJobId}`);
} else {
await this.writeRemoteTrialFile(trialJobId, trialJobDetail.sequenceId.toString(), rmMeta, path.join('.nni', 'sequence_id'));
}
}
private async writeRemoteTrialFile(trialJobId: string, fileContent: string, private async writeRemoteTrialFile(trialJobId: string, fileContent: string,
rmMeta: RemoteMachineMeta, fileName: string): Promise<void> { rmMeta: RemoteMachineMeta, fileName: string): Promise<void> {
const sshClient: Client | undefined = this.machineSSHClientMap.get(rmMeta); const sshClient: Client | undefined = this.machineSSHClientMap.get(rmMeta);
......
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as chai from 'chai';
import * as chaiAsPromised from 'chai-as-promised';
import * as fs from 'fs';
import * as tmp from 'tmp';
import * as component from '../../common/component';
import { cleanupUnitTest, prepareUnitTest } from '../../common/utils';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { KubeflowTrainingService } from '../kubeflow/kubeflowTrainingService';
// TODO: copy mockedTrail.py to local folder
const localCodeDir: string = tmp.dirSync().name
const mockedTrialPath: string = './training_service/test/mockedTrial.py'
fs.copyFileSync(mockedTrialPath, localCodeDir + '/mockedTrial.py')
describe('Unit Test for KubeflowTrainingService', () => {
let skip: boolean = false;
let testKubeflowConfig: any;
let testKubeflowTrialConfig : any;
try {
testKubeflowConfig = JSON.parse(fs.readFileSync('../../.vscode/kubeflowCluster.json', 'utf8'));
testKubeflowTrialConfig = `{\"command\":\"python3 mnist-keras.py\",\"codeDir\":\"/home/desy/nni/examples/trials/mnist",\"gpuNum\":\"1\",\"cpuNum\":\"2\",\"memoryMB\":\"8196\",\"image\":\"msranni/nni:v0.3.3\"}`;
} catch (err) {
console.log('Please configure rminfo.json to enable remote machine unit test.');
//skip = true;
}
let kubeflowTrainingService: KubeflowTrainingService;
console.log(tmp.dirSync().name);
before(() => {
chai.should();
chai.use(chaiAsPromised);
prepareUnitTest();
});
after(() => {
cleanupUnitTest();
});
beforeEach(() => {
if (skip) {
return;
}
kubeflowTrainingService = component.get(KubeflowTrainingService);
});
afterEach(() => {
if (skip) {
return;
}
kubeflowTrainingService.cleanUp();
});
it('Simple Test', async () => {
if (skip) {
return;
}
kubeflowTrainingService.setClusterMetadata(TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG, testKubeflowConfig),
kubeflowTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, testKubeflowTrialConfig);
try {
const trialDetail = await kubeflowTrainingService.submitTrialJob({jobType : 'TRIAL'});
} catch(error) {
console.log('Submit job failed:' + error);
chai.assert(error)
}
});
});
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment