Unverified Commit 806afeb6 authored by fishyds's avatar fishyds Committed by GitHub
Browse files

[Kubeflow Training Service] V1, merge from kubeflow branch to master branch (#382)

* Kubeflow TrainingService support, v1 (#373)

1. Create new Training Service: kubeflow trainning service, use 'kubectl' and kubeflow tfjobs CRD to submit and manage jobs
2. Update nni python SDK to support new kubeflow platform
3. Update nni python SDK's get_sequende_id() implementation, read NNI_TRIAL_SEQ_ID env variable, instead of reading .nni/sequence_id file
4. This version only supports Tensorflow operator. Will add more operators' support in future versions
parent b749266d
......@@ -36,8 +36,8 @@ import { LocalTrainingServiceForGPU } from './training_service/local/localTraini
import {
RemoteMachineTrainingService
} from './training_service/remote_machine/remoteMachineTrainingService';
import { PAITrainingService } from './training_service/pai/paiTrainingService'
import { PAITrainingService } from './training_service/pai/paiTrainingService';
import { KubeflowTrainingService } from './training_service/kubeflow/kubeflowTrainingService';
function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number) {
const createNew: boolean = (startExpMode === 'new');
......@@ -52,6 +52,8 @@ async function initContainer(platformMode: string): Promise<void> {
Container.bind(TrainingService).to(RemoteMachineTrainingService).scope(Scope.Singleton);
} else if (platformMode === 'pai') {
Container.bind(TrainingService).to(PAITrainingService).scope(Scope.Singleton);
} else if (platformMode === 'kubeflow') {
Container.bind(TrainingService).to(KubeflowTrainingService).scope(Scope.Singleton);
} else {
throw new Error(`Error: unsupported mode: ${mode}`);
}
......@@ -76,19 +78,22 @@ if (!strPort || strPort.length === 0) {
const port: number = parseInt(strPort, 10);
const mode: string = parseArg(['--mode', '-m']);
if (!['local', 'remote', 'pai'].includes(mode)) {
if (!['local', 'remote', 'pai', 'kubeflow'].includes(mode)) {
console.log(`FATAL: unknown mode: ${mode}`);
usage();
process.exit(1);
}
const startMode: string = parseArg(['--start_mode', '-s']);
if (!['new', 'resume'].includes(startMode)) {
console.log(`FATAL: unknown start_mode: ${startMode}`);
usage();
process.exit(1);
}
const experimentId: string = parseArg(['--experiment_id', '-id']);
if (startMode === 'resume' && experimentId.trim().length < 1) {
console.log(`FATAL: cannot resume experiment, invalid experiment_id: ${experimentId}`);
usage();
process.exit(1);
}
......
......@@ -15,6 +15,7 @@
"express": "^4.16.3",
"express-joi-validator": "^2.0.0",
"node-nvidia-smi": "^1.0.0",
"node-yaml": "^3.1.1",
"rx": "^4.1.0",
"sqlite3": "^4.0.2",
"ssh2": "^0.6.1",
......
......@@ -89,9 +89,7 @@ class NNIRestHandler {
return router;
}
private handle_error(err: Error, res: Response): void {
this.log.info(err);
private handle_error(err: Error, res: Response, isFatal: boolean = false): void {
if (err instanceof NNIError && err.name === NNIErrorNames.NOT_FOUND) {
res.status(404);
} else {
......@@ -100,6 +98,14 @@ class NNIRestHandler {
res.send({
error: err.message
});
// If it's a fatal error, exit process
if(isFatal) {
this.log.critical(err);
process.exit(1);
}
this.log.error(err);
}
// TODO add validators for request params, query, body
......@@ -145,12 +151,14 @@ class NNIRestHandler {
experiment_id: eid
});
}).catch((err: Error) => {
// Start experiment is a step of initialization, so any exception thrown is a fatal
this.handle_error(err, res);
});
} else {
this.nniManager.resumeExperiment().then(() => {
res.send();
}).catch((err: Error) => {
// Resume experiment is a step of initialization, so any exception thrown is a fatal
this.handle_error(err, res);
});
}
......@@ -180,7 +188,8 @@ class NNIRestHandler {
}
res.send();
} catch (err) {
this.handle_error(err, res);
// setClusterMetata is a step of initialization, so any exception thrown is a fatal
this.handle_error(err, res, true);
}
});
}
......
......@@ -46,6 +46,14 @@ export namespace ValidationSchemas {
userName: joi.string().min(1).required(),
passWord: joi.string().min(1).required(),
host: joi.string().min(1).required()
}),
kubeflow_config: joi.object({
operator: joi.string().min(1).required(),
nfs: joi.object({
server: joi.string().min(1).required(),
path: joi.string().min(1).required()
}).required(),
kubernetesServer: joi.string().min(1).required()
})
}
};
......
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as assert from 'assert';
import { Request, Response, Router } from 'express';
import * as bodyParser from 'body-parser';
import * as component from '../../common/component';
import { getBasePort, getExperimentId } from '../../common/experimentStartupInfo';
import { RestServer } from '../../common/restServer'
/**
* Cluster Job Training service Rest server, provides rest API to support Cluster job metrics update
*
*/
@component.Singleton
export abstract class ClusterJobRestServer extends RestServer{
private readonly API_ROOT_URL: string = '/api/v1/nni-pai';
private readonly expId: string = getExperimentId();
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
constructor() {
super();
const basePort: number = getBasePort();
assert(basePort && basePort > 1024);
this.port = basePort + 1;
}
public get clusterRestServerPort(): number {
if(!this.port) {
throw new Error('PAI Rest server port is undefined');
}
return this.port;
}
/**
* NNIRestServer's own router registration
*/
protected registerRestHandler(): void {
this.app.use(bodyParser.json());
this.app.use(this.API_ROOT_URL, this.createRestHandler());
}
private createRestHandler() : Router {
const router: Router = Router();
// tslint:disable-next-line:typedef
router.use((req: Request, res: Response, next) => {
this.log.info(`${req.method}: ${req.url}: body:\n${JSON.stringify(req.body, undefined, 4)}`);
res.setHeader('Content-Type', 'application/json');
next();
});
router.post(`/update-metrics/${this.expId}/:trialId`, (req: Request, res: Response) => {
try {
this.log.info(`Get update-metrics request, trial job id is ${req.params.trialId}`);
this.log.info(`update-metrics body is ${JSON.stringify(req.body)}`);
this.handleTrialMetrics(req.body.jobId, req.body.metrics);
res.send();
}
catch(err) {
this.log.error(`json parse metrics error: ${err}`);
res.status(500);
res.send(err.message);
}
});
return router;
}
/** Abstract method to handle trial metrics data */
protected abstract handleTrialMetrics(jobId : string, trialMetrics : any[]) : void;
}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
export const CONTAINER_INSTALL_NNI_SHELL_FORMAT: string =
`#!/bin/bash
if python3 -c 'import nni' > /dev/null 2>&1; then
# nni module is already installed, skip
return
else
# Install nni
python3 -m pip install --user --upgrade nni
fi`;
\ No newline at end of file
......@@ -28,5 +28,6 @@ export enum TrialConfigMetadataKey {
EXPERIMENT_ID = 'experimentId',
MULTI_PHASE = 'multiPhase',
RANDOM_SCHEDULER = 'random_scheduler',
PAI_CLUSTER_CONFIG = 'pai_config'
PAI_CLUSTER_CONFIG = 'pai_config',
KUBEFLOW_CLUSTER_CONFIG = 'kubeflow_config'
}
import { TrialConfig } from "../common/trialConfig";
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
/** operator types that kubeflow supported */
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' | 'mxnet-operator' | 'caffe2-operator' | 'chainer-operator' | 'mpi-operator';
export type KubeflowOperatorPlural = 'tfjobs' | 'pytorchjobs' | 'mxjobs' | 'caffe2jobs' | 'chainerjobs' | 'mpijobs';
/**
* map from Kubeflow operator name to its plural name in K8S
*/
export const kubeflowOperatorMap : Map<KubeflowOperator, KubeflowOperatorPlural> = new Map<KubeflowOperator, KubeflowOperatorPlural>([
['tf-operator' , 'tfjobs'],
['pytorch-operator', 'pytorchjobs'],
['mxnet-operator', 'mxjobs'],
['caffe2-operator', 'caffe2jobs'],
['chainer-operator', 'chainerjobs'],
['mpi-operator', 'mpijobs']
]);
/**
* Kuberflow cluster configuration
*
*/
export class KubeflowClusterConfig {
/** Name of Kubeflow operator, like tf-operator */
public readonly operator: KubeflowOperator;
public readonly nfs: NFSConfig;
public readonly kubernetesServer: string;
/**
* Constructor
* @param userName User name of Kubeflow Cluster
* @param passWord password of Kubeflow Cluster
* @param host Host IP of Kubeflow Cluster
*/
constructor(operator: KubeflowOperator, nfs : NFSConfig, kubernetesServer : string) {
this.operator = operator;
this.nfs = nfs;
this.kubernetesServer = kubernetesServer;
}
}
/**
* NFS configuration to store Kubeflow job related files
*/
export class NFSConfig {
/** IP Adress of NFS server */
public readonly server : string;
/** exported NFS path on NFS server */
public readonly path : string;
constructor(server : string, path : string) {
this.server = server;
this.path = path;
}
}
/**
* Trial job configuration for Kubeflow
*/
export class KubeflowTrialConfig extends TrialConfig {
public readonly cpuNum: number;
public readonly memoryMB: number;
public readonly image: string;
constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, image: string) {
super(command, codeDir, gpuNum);
this.cpuNum = cpuNum;
this.memoryMB = memoryMB;
this.image = image;
}
}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
/**
* KubeflowTrialJobDetail
*/
// tslint:disable-next-line:max-classes-per-file
export class KubeflowTrialJobDetail implements TrialJobDetail {
public id: string;
public status: TrialJobStatus;
public submitTime: number;
public startTime?: number;
public endTime?: number;
public tags?: string[];
public url?: string;
public workingDirectory: string;
public form: JobApplicationForm;
public kubeflowJobName: string;
public sequenceId: number;
public queryJobFailedCount: number;
public k8sPluralName: string
constructor(id: string, status: TrialJobStatus, submitTime: number,
workingDirectory: string, form: JobApplicationForm,
kubeflowJobName: string, sequenceId: number, url: string, k8sPluralName: string) {
this.id = id;
this.status = status;
this.submitTime = submitTime;
this.workingDirectory = workingDirectory;
this.form = form;
this.kubeflowJobName = kubeflowJobName;
this.sequenceId = sequenceId;
this.tags = [];
this.queryJobFailedCount = 0;
this.url = url;
this.k8sPluralName = k8sPluralName;
}
}
export const KUBEFLOW_RUN_SHELL_FORMAT: string =
`#!/bin/bash
export NNI_PLATFORM=kubeflow
export NNI_SYS_DIR={0}
export NNI_OUTPUT_DIR={1}
export MULTI_PHASE=false
export NNI_TRIAL_JOB_ID={2}
export NNI_EXP_ID={3}
export NNI_CODE_DIR={4}
export NNI_TRIAL_SEQ_ID={5}
mkdir -p $NNI_SYS_DIR
mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh # Check and install NNI pkg
python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR//trialkeeper_stderr
`
export type KubeflowTFJobType = 'Created' | 'Running' | 'Failed' | 'Succeeded';
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as cpp from 'child-process-promise';
import { getLogger, Logger } from '../../common/log';
import { KubeflowTrialJobDetail, KubeflowTFJobType} from './kubeflowData';
import { NNIError, NNIErrorNames } from '../../common/errors';
import { TrialJobStatus } from '../../common/trainingService';
/**
* Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally
*/
export class KubeflowJobInfoCollector {
private readonly trialJobsMap : Map<string, KubeflowTrialJobDetail>;
private readonly log: Logger = getLogger();
private readonly statusesNeedToCheck: TrialJobStatus[];
private readonly MAX_FAILED_QUERY_JOB_NUMBER: number = 30;
constructor(jobMap: Map<string, KubeflowTrialJobDetail>) {
this.trialJobsMap = jobMap;
this.statusesNeedToCheck = ['RUNNING', 'WAITING'];
}
public async retrieveTrialStatus() : Promise<void> {
const updateKubeflowTrialJobs : Promise<void>[] = [];
for(let [trialJobId, kubeflowTrialJob] of this.trialJobsMap) {
if (!kubeflowTrialJob) {
throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`);
}
// Since Kubeflow needs some delay to schedule jobs, we provide 20 seconds buffer time to check kubeflow job's status
if( Date.now() - kubeflowTrialJob.submitTime < 20 * 1000) {
return Promise.resolve();
}
updateKubeflowTrialJobs.push(this.retrieveSingleTrialJobInfo(kubeflowTrialJob))
}
await Promise.all(updateKubeflowTrialJobs);
}
private async retrieveSingleTrialJobInfo(kubeflowTrialJob : KubeflowTrialJobDetail) : Promise<void> {
if (!this.statusesNeedToCheck.includes(kubeflowTrialJob.status)) {
return Promise.resolve();
}
let result : cpp.childProcessPromise.Result;
try {
result = await cpp.exec(`kubectl get ${kubeflowTrialJob.k8sPluralName} ${kubeflowTrialJob.kubeflowJobName} -o json`);
if(result.stderr) {
this.log.error(`Get ${kubeflowTrialJob.k8sPluralName} ${kubeflowTrialJob.kubeflowJobName} failed. Error is ${result.stderr}, failed checking number is ${kubeflowTrialJob.queryJobFailedCount}`);
kubeflowTrialJob.queryJobFailedCount++;
if(kubeflowTrialJob.queryJobFailedCount >= this.MAX_FAILED_QUERY_JOB_NUMBER) {
kubeflowTrialJob.status = 'UNKNOWN';
}
}
} catch(error) {
this.log.error(`kubectl get ${kubeflowTrialJob.k8sPluralName} ${kubeflowTrialJob.kubeflowJobName} failed, error is ${error}`);
return Promise.resolve();
}
const kubeflowJobInfo = JSON.parse(result.stdout);
if(kubeflowJobInfo.status && kubeflowJobInfo.status.conditions) {
const latestCondition = kubeflowJobInfo.status.conditions[kubeflowJobInfo.status.conditions.length - 1];
const tfJobType : KubeflowTFJobType = <KubeflowTFJobType>latestCondition.type;
switch(tfJobType) {
case 'Created':
kubeflowTrialJob.status = 'WAITING';
kubeflowTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
break;
case 'Running':
kubeflowTrialJob.status = 'RUNNING';
if(!kubeflowTrialJob.startTime) {
kubeflowTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
}
break;
case 'Failed':
kubeflowTrialJob.status = 'FAILED';
kubeflowTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
break;
case 'Succeeded':
kubeflowTrialJob.status = 'SUCCEEDED';
kubeflowTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
break;
default:
break;
}
}
return Promise.resolve();
}
}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as component from '../../common/component';
import { Inject } from 'typescript-ioc';
import { KubeflowTrainingService } from './kubeflowTrainingService';
import { ClusterJobRestServer } from '../common/clusterJobRestServer'
/**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
*
*/
@component.Singleton
export class KubeflowJobRestServer extends ClusterJobRestServer{
@Inject
private readonly kubeflowTrainingService : KubeflowTrainingService;
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
constructor() {
super();
this.kubeflowTrainingService = component.get(KubeflowTrainingService);
}
protected handleTrialMetrics(jobId : string, metrics : any[]) : void {
// Split metrics array into single metric, then emit
// Warning: If not split metrics into single ones, the behavior will be UNKNOWN
for (const singleMetric of metrics) {
this.kubeflowTrainingService.MetricsEmitter.emit('metric', {
id : jobId,
data : singleMetric
});
}
}
}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict'
import * as assert from 'assert';
import * as component from '../../common/component';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import * as path from 'path';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { EventEmitter } from 'events';
import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import { MethodNotImplementedError } from '../../common/errors';
import { String } from 'typescript-string-operations';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import {
JobApplicationForm, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric
} from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils';
import { KubeflowClusterConfig, kubeflowOperatorMap, KubeflowTrialConfig, NFSConfig } from './kubeflowConfig';
import { KubeflowTrialJobDetail, KUBEFLOW_RUN_SHELL_FORMAT } from './kubeflowData';
import { KubeflowJobRestServer } from './kubeflowJobRestServer';
import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
var yaml = require('node-yaml');
/**
* Training Service implementation for Kubeflow
* Refer https://github.com/kubeflow/kubeflow for more info about Kubeflow
*/
@component.Singleton
class KubeflowTrainingService implements TrainingService {
private readonly NNI_KUBEFLOW_TRIAL_LABEL = 'nni-kubeflow-trial';
private readonly log!: Logger;
private readonly metricsEmitter: EventEmitter;
private readonly trialJobsMap: Map<string, KubeflowTrialJobDetail>;
/** experiment root dir in NFS */
private readonly trialLocalNFSTempFolder: string;
private stopping: boolean = false;
private experimentId! : string;
private nextTrialSequenceId: number;
private kubeflowClusterConfig?: KubeflowClusterConfig;
private kubeflowTrialConfig?: KubeflowTrialConfig;
private kubeflowJobInfoCollector: KubeflowJobInfoCollector;
private kubeflowRestServerPort?: number;
private kubeflowJobPlural?: string;
private readonly CONTAINER_MOUNT_PATH: string;
constructor() {
this.log = getLogger();
this.metricsEmitter = new EventEmitter();
this.trialJobsMap = new Map<string, KubeflowTrialJobDetail>();
this.kubeflowJobInfoCollector = new KubeflowJobInfoCollector(this.trialJobsMap);
this.trialLocalNFSTempFolder = path.join(getExperimentRootDir(), 'trials-nfs-tmp');
this.experimentId = getExperimentId();
this.nextTrialSequenceId = -1;
this.CONTAINER_MOUNT_PATH = '/tmp/nfs';
}
public async run(): Promise<void> {
const restServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer);
await restServer.start();
this.log.info(`Kubeflow Training service rest server listening on: ${restServer.endPoint}`);
while (!this.stopping) {
// collect metrics by calling 'kubectl get' command on Kubeflow jobs
await delay(3000);
await this.kubeflowJobInfoCollector.retrieveTrialStatus();
}
}
public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> {
if(!this.kubeflowClusterConfig) {
throw new Error('Kubeflow Cluster config is not initialized');
}
if(!this.kubeflowTrialConfig) {
throw new Error('Kubeflow trial config is not initialized');
}
if(!this.kubeflowJobPlural) {
throw new Error('Kubeflow job plural name is undefined');
}
if(!this.kubeflowRestServerPort) {
const restServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer);
this.kubeflowRestServerPort = restServer.clusterRestServerPort;
}
const trialJobId: string = uniqueString(5);
const curTrialSequenceId: number = this.generateSequenceId();
// Set trial's NFS working folder
const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId);
const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${this.kubeflowTrialConfig.codeDir} ${trialLocalTempFolder}`);
const runScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' });
const kubeflowRunScriptContent: string = String.Format(
KUBEFLOW_RUN_SHELL_FORMAT,
`$PWD/nni/${trialJobId}`,
path.join(trialWorkingFolder, 'output'),
trialJobId,
getExperimentId(),
trialWorkingFolder,
curTrialSequenceId,
this.kubeflowTrialConfig.command,
getIPV4Address(),
this.kubeflowRestServerPort
);
//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);
// Write file content ( run.sh and parameter.cfg ) to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run.sh'), kubeflowRunScriptContent, { encoding: 'utf8' });
// Write file content ( parameter.cfg ) to local tmp folders
const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form)
if(trialForm && trialForm.hyperParameters) {
await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)),
trialForm.hyperParameters.value, { encoding: 'utf8' });
}
const kubeflowJobYamlPath = path.join(trialLocalTempFolder, `kubeflow-job-${trialJobId}.yaml`);
const kubeflowJobName = `nni-exp-${this.experimentId}-trial-${trialJobId}`.toLowerCase();
const podResources : any = {};
podResources.requests = {
'memory': `${this.kubeflowTrialConfig.memoryMB}Mi`,
'cpu': `${this.kubeflowTrialConfig.cpuNum}`,
'nvidia.com/gpu': `${this.kubeflowTrialConfig.gpuNum}`
}
podResources.limits = Object.assign({}, podResources.requests);
// Generate kubeflow job resource yaml file for K8S
yaml.write(
kubeflowJobYamlPath,
this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, podResources),
'utf-8'
);
// Creat work dir for current trial in NFS directory
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`);
// Copy code files from local dir to NFS mounted dir
await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
const nfsConfig: NFSConfig = this.kubeflowClusterConfig.nfs;
const trialJobDetail: KubeflowTrialJobDetail = new KubeflowTrialJobDetail(
trialJobId,
'WAITING',
Date.now(),
trialWorkingFolder,
form,
kubeflowJobName,
curTrialSequenceId,
`nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`,
this.kubeflowJobPlural
);
// Create kubeflow training jobs
await cpp.exec(`kubectl create -f ${kubeflowJobYamlPath}`);
// Set trial job detail until kubectl create resource successfully
this.trialJobsMap.set(trialJobId, trialJobDetail);
return Promise.resolve(trialJobDetail);
}
public updateTrialJob(trialJobId: string, form: JobApplicationForm): Promise<TrialJobDetail> {
throw new MethodNotImplementedError();
}
public listTrialJobs(): Promise<TrialJobDetail[]> {
throw new MethodNotImplementedError();
}
public getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
if(!this.kubeflowClusterConfig) {
throw new Error('Kubeflow Cluster config is not initialized');
}
const kubeflowTrialJob: TrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
if (!kubeflowTrialJob) {
return Promise.reject(`trial job ${trialJobId} not found`)
}
return Promise.resolve(kubeflowTrialJob);
}
public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void) {
this.metricsEmitter.on('metric', listener);
}
public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void) {
this.metricsEmitter.off('metric', listener);
}
public get isMultiPhaseJobSupported(): boolean {
return false;
}
public async cancelTrialJob(trialJobId: string): Promise<void> {
const trialJobDetail : KubeflowTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
if(!trialJobDetail) {
const errorMessage: string = `CancelTrialJob: trial job id ${trialJobId} not found`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}
if(!this.kubeflowJobPlural) {
const errorMessage: string = `CancelTrialJob: trial job id ${trialJobId} failed because kubeflowJobPlural is undefined`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}
const result: cpp.childProcessPromise.Result = await cpp.exec(`kubectl delete ${this.kubeflowJobPlural} -l app=${this.NNI_KUBEFLOW_TRIAL_LABEL},expId=${getExperimentId()},trialId=${trialJobId}`);
if(result.stderr) {
const errorMessage: string = `kubectl delete ${this.kubeflowJobPlural} for trial ${trialJobId} failed: ${result.stderr}`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}
trialJobDetail.endTime = Date.now();
trialJobDetail.status = 'USER_CANCELED';
return Promise.resolve();
}
public async setClusterMetadata(key: string, value: string): Promise<void> {
switch (key) {
case TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG:
this.kubeflowClusterConfig = <KubeflowClusterConfig>JSON.parse(value);
// If NFS config section is valid in config file, proceed to mount and config NFS
if(this.kubeflowClusterConfig.nfs) {
//Check and mount NFS mount point here
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}`);
const nfsServer: string = this.kubeflowClusterConfig.nfs.server;
const nfsPath: string = this.kubeflowClusterConfig.nfs.path;
try {
await cpp.exec(`sudo mount ${nfsServer}:${nfsPath} ${this.trialLocalNFSTempFolder}`);
} catch(error) {
const mountError: string = `Mount NFS ${nfsServer}:${nfsPath} to ${this.trialLocalNFSTempFolder} failed, error is ${error}`;
this.log.error(mountError);
throw new Error(mountError);
}
}
this.kubeflowJobPlural = kubeflowOperatorMap.get(this.kubeflowClusterConfig.operator);
break;
case TrialConfigMetadataKey.TRIAL_CONFIG:
if (!this.kubeflowClusterConfig){
this.log.error('kubeflow cluster config is not initialized');
return Promise.reject(new Error('kubeflow cluster config is not initialized'));
}
this.kubeflowTrialConfig = <KubeflowTrialConfig>JSON.parse(value);
break;
default:
break;
}
return Promise.resolve();
}
public getClusterMetadata(key: string): Promise<string> {
throw new MethodNotImplementedError();
}
public async cleanUp(): Promise<void> {
this.stopping = true;
// First, cancel all running kubeflow jobs
for(let [trialJobId, kubeflowTrialJob] of this.trialJobsMap) {
if(['RUNNING', 'WAITING', 'UNKNOWN'].includes(kubeflowTrialJob.status)) {
try {
await this.cancelTrialJob(trialJobId);
} catch(error) {} // DONT throw error during cleanup
kubeflowTrialJob.status = 'SYS_CANCELED';
}
}
assert(this.kubeflowJobPlural !== undefined);
// Delete all kubeflow jobs whose expId label is current experiment id
try {
await cpp.exec(`kubectl delete ${this.kubeflowJobPlural} -l app=${this.NNI_KUBEFLOW_TRIAL_LABEL},expId=${getExperimentId()}`);
} catch(error) {
this.log.error(`Delete ${this.kubeflowJobPlural} with label: app=${this.NNI_KUBEFLOW_TRIAL_LABEL},expId=${getExperimentId()} failed, error is ${error}`);
}
// Unmount NFS
try {
await cpp.exec(`sudo umount ${this.trialLocalNFSTempFolder}`);
} catch(error) {
this.log.error(`Unmount ${this.trialLocalNFSTempFolder} failed, error is ${error}`);
}
// Stop Kubeflow rest server
const restServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer);
try {
await restServer.stop();
this.log.info('Kubeflow Training service rest server stopped successfully.');
} catch (error) {
this.log.error(`Kubeflow Training service rest server stopped failed, error: ${error.message}`);
Promise.reject(error);
}
return Promise.resolve();
}
public get MetricsEmitter() : EventEmitter {
return this.metricsEmitter;
}
private generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName : string, podResources : any) : any {
if(!this.kubeflowClusterConfig) {
throw new Error('Kubeflow Cluster config is not initialized');
}
if(!this.kubeflowTrialConfig) {
throw new Error('Kubeflow trial config is not initialized');
}
return {
apiVersion: 'kubeflow.org/v1alpha2',
kind: 'TFJob',
metadata: {
name: kubeflowJobName,
namespace: 'default',
labels: {
app: this.NNI_KUBEFLOW_TRIAL_LABEL,
expId: getExperimentId(),
trialId: trialJobId
}
},
spec: {
tfReplicaSpecs: {
Worker: {
replicas: 1,
template: {
metadata: {
creationTimestamp: null
},
spec: {
containers: [
{
// Kubeflow tensorflow operator requires that containers' name must be tensorflow
// TODO: change the name based on operator's type
name: 'tensorflow',
image: this.kubeflowTrialConfig.image,
args: ["sh", `${path.join(trialWorkingFolder, 'run.sh')}`],
volumeMounts: [{
name: 'nni-nfs-vol',
mountPath: this.CONTAINER_MOUNT_PATH
}],
resources: podResources//,
//workingDir: '/tmp/nni/nuDEP'
}],
restartPolicy: 'ExitCode',
volumes: [{
name: 'nni-nfs-vol',
nfs: {
server: `${this.kubeflowClusterConfig.nfs.server}`,
path: `${this.kubeflowClusterConfig.nfs.path}`
}
}]
}
}
}
}
}
};
}
private generateSequenceId(): number {
if (this.nextTrialSequenceId === -1) {
this.nextTrialSequenceId = getInitTrialSequenceId();
}
return this.nextTrialSequenceId++;
}
}
export { KubeflowTrainingService }
\ No newline at end of file
......@@ -320,6 +320,7 @@ class LocalTrainingService implements TrainingService {
{ key: 'NNI_SYS_DIR', value: trialJobDetail.workingDirectory },
{ key: 'NNI_TRIAL_JOB_ID', value: trialJobDetail.id },
{ key: 'NNI_OUTPUT_DIR', value: trialJobDetail.workingDirectory },
{ key: 'NNI_TRIAL_SEQ_ID', value: trialJobDetail.sequenceId.toString() },
{ key: 'MULTI_PHASE', value: this.isMultiPhase.toString() }
];
}
......@@ -368,7 +369,6 @@ class LocalTrainingService implements TrainingService {
await cpp.exec(`touch ${path.join(trialJobDetail.workingDirectory, '.nni', 'metrics')}`);
await fs.promises.writeFile(path.join(trialJobDetail.workingDirectory, 'run.sh'), runScriptLines.join('\n'), { encoding: 'utf8', mode: 0o777 });
await this.writeParameterFile(trialJobDetail.workingDirectory, (<TrialJobApplicationForm>trialJobDetail.form).hyperParameters);
await this.writeSequenceIdFile(trialJobId);
const process: cp.ChildProcess = cp.exec(`bash ${path.join(trialJobDetail.workingDirectory, 'run.sh')}`);
this.setTrialJobStatus(trialJobDetail, 'RUNNING');
......@@ -450,13 +450,6 @@ class LocalTrainingService implements TrainingService {
return this.trialSequenceId++;
}
private async writeSequenceIdFile(trialJobId: string): Promise<void> {
const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId);
assert(trialJobDetail !== undefined);
const filepath: string = path.join(trialJobDetail.workingDirectory, '.nni', 'sequence_id');
await fs.promises.writeFile(filepath, trialJobDetail.sequenceId.toString(), { encoding: 'utf8' });
}
}
export { LocalTrainingService };
......@@ -60,10 +60,10 @@ else
fi`;
export const PAI_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3}
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4}
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{4}' --nnimanager_ip '{5}' --nnimanager_port '{6}'
--pai_hdfs_output_dir '{7}' --pai_hdfs_host '{8}' --pai_user_name {9}`;
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10}`;
export const PAI_OUTPUT_DIR_FORMAT: string =
`hdfs://{0}:9000/`;
......
......@@ -20,7 +20,6 @@
'use strict';
import * as request from 'request';
import { EventEmitter } from 'events';
import { Deferred } from 'ts-deferred';
import { getLogger, Logger } from '../../common/log';
import { NNIError, NNIErrorNames } from '../../common/errors';
......@@ -43,7 +42,7 @@ export class PAIJobInfoCollector {
this.finalStatuses = ['SUCCEEDED', 'FAILED', 'USER_CANCELED', 'SYS_CANCELED'];
}
public async updateTrialStatusFromPAI(paiToken? : string, paiClusterConfig?: PAIClusterConfig) : Promise<void> {
public async retrieveTrialStatus(paiToken? : string, paiClusterConfig?: PAIClusterConfig) : Promise<void> {
if (!paiClusterConfig || !paiToken) {
return Promise.resolve();
}
......
......@@ -19,29 +19,17 @@
'use strict';
import * as assert from 'assert';
import { Request, Response, Router } from 'express';
import * as bodyParser from 'body-parser';
import * as component from '../../common/component';
import { getBasePort } from '../../common/experimentStartupInfo';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { Inject } from 'typescript-ioc';
import { PAITrainingService } from './paiTrainingService';
import { RestServer } from '../../common/restServer'
import { ClusterJobRestServer } from '../common/clusterJobRestServer'
/**
* PAI Training service Rest server, provides rest API to support pai job metrics update
*
*/
@component.Singleton
export class PAIJobRestServer extends RestServer{
/** NNI main rest service default port */
private static readonly DEFAULT_PORT: number = 51189;
private readonly API_ROOT_URL: string = '/api/v1/nni-pai';
private readonly expId: string = getExperimentId();
export class PAIJobRestServer extends ClusterJobRestServer{
@Inject
private readonly paiTrainingService : PAITrainingService;
......@@ -50,61 +38,17 @@ export class PAIJobRestServer extends RestServer{
*/
constructor() {
super();
const basePort: number = getBasePort();
assert(basePort && basePort > 1024);
this.port = basePort + 1; // PAIJobRestServer.DEFAULT_PORT;
this.paiTrainingService = component.get(PAITrainingService);
}
public get paiRestServerPort(): number {
if(!this.port) {
throw new Error('PAI Rest server port is undefined');
}
return this.port;
}
/**
* NNIRestServer's own router registration
*/
protected registerRestHandler(): void {
this.app.use(bodyParser.json());
this.app.use(this.API_ROOT_URL, this.createRestHandler());
}
private createRestHandler() : Router {
const router: Router = Router();
// tslint:disable-next-line:typedef
router.use((req: Request, res: Response, next) => {
this.log.info(`${req.method}: ${req.url}: body:\n${JSON.stringify(req.body, undefined, 4)}`);
res.setHeader('Content-Type', 'application/json');
next();
});
router.post(`/update-metrics/${this.expId}/:trialId`, (req: Request, res: Response) => {
try {
this.log.info(`Get update-metrics request, trial job id is ${req.params.trialId}`);
this.log.info(`update-metrics body is ${JSON.stringify(req.body)}`);
protected handleTrialMetrics(jobId : string, metrics : any[]) : void {
// Split metrics array into single metric, then emit
// Warning: If not split metrics into single ones, the behavior will be UNKNOWN
for (const singleMetric of req.body.metrics) {
for (const singleMetric of metrics) {
this.paiTrainingService.MetricsEmitter.emit('metric', {
id : req.body.jobId,
id : jobId,
data : singleMetric
});
}
res.send();
}
catch(err) {
this.log.error(`json parse metrics error: ${err}`);
res.status(500);
res.send(err.message);
}
});
return router;
}
}
\ No newline at end of file
......@@ -20,13 +20,13 @@
'use strict'
import * as assert from 'assert';
import * as component from '../../common/component';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import * as path from 'path';
import * as request from 'request';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { Deferred } from 'ts-deferred';
import { EventEmitter } from 'events';
import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo';
......@@ -40,7 +40,7 @@ import {
} from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils';
import { PAIJobRestServer } from './paiJobRestServer'
import { PAITrialJobDetail, PAI_INSTALL_NNI_SHELL_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData';
import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData';
import { PAIJobInfoCollector } from './paiJobInfoCollector';
import { String } from 'typescript-string-operations';
import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig';
......@@ -67,7 +67,7 @@ class PAITrainingService implements TrainingService {
private readonly hdfsDirPattern: string;
private hdfsBaseDir: string | undefined;
private hdfsOutputHost: string | undefined;
private trialSequenceId: number;
private nextTrialSequenceId: number;
private paiRestServerPort?: number;
constructor() {
......@@ -79,7 +79,7 @@ class PAITrainingService implements TrainingService {
this.experimentId = getExperimentId();
this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap);
this.hdfsDirPattern = 'hdfs://(?<host>([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?<baseDir>/.*)?';
this.trialSequenceId = -1;
this.nextTrialSequenceId = -1;
}
public async run(): Promise<void> {
......@@ -87,7 +87,7 @@ class PAITrainingService implements TrainingService {
await restServer.start();
this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`);
while (!this.stopping) {
await this.paiJobCollector.updateTrialStatusFromPAI(this.paiToken, this.paiClusterConfig);
await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiClusterConfig);
await delay(3000);
}
}
......@@ -148,7 +148,7 @@ class PAITrainingService implements TrainingService {
if(!this.paiRestServerPort) {
const restServer: PAIJobRestServer = component.get(PAIJobRestServer);
this.paiRestServerPort = restServer.paiRestServerPort;
this.paiRestServerPort = restServer.clusterRestServerPort;
}
this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`);
......@@ -162,9 +162,8 @@ class PAITrainingService implements TrainingService {
//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${this.paiTrialConfig.codeDir} ${trialLocalTempFolder}`);
await cpp.exec(`mkdir -p ${path.join(trialLocalTempFolder, '.nni')}`);
const runScriptContent : string = PAI_INSTALL_NNI_SHELL_FORMAT;
const runScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' });
......@@ -173,7 +172,6 @@ class PAITrainingService implements TrainingService {
if(trialForm) {
await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)),
trialForm.hyperParameters.value, { encoding: 'utf8' });
await fs.promises.writeFile(path.join(trialLocalTempFolder, '.nni', 'sequence_id'), trialSequenceId.toString(), { encoding: 'utf8' });
}
// Step 1. Prepare PAI job configuration
......@@ -204,6 +202,7 @@ class PAITrainingService implements TrainingService {
`$PWD/${trialJobId}/nnioutput`,
trialJobId,
this.experimentId,
trialSequenceId,
this.paiTrialConfig.command,
getIPV4Address(),
this.paiRestServerPort,
......@@ -461,11 +460,11 @@ class PAITrainingService implements TrainingService {
}
private generateSequenceId(): number {
if (this.trialSequenceId === -1) {
this.trialSequenceId = getInitTrialSequenceId();
if (this.nextTrialSequenceId === -1) {
this.nextTrialSequenceId = getInitTrialSequenceId();
}
return this.trialSequenceId++;
return this.nextTrialSequenceId++;
}
}
......
......@@ -112,6 +112,7 @@ export const REMOTEMACHINE_RUN_SHELL_FORMAT: string =
`#!/bin/bash
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0}
export MULTI_PHASE={7}
export NNI_TRIAL_SEQ_ID={8}
cd $NNI_SYS_DIR
echo $$ >{2}
eval {3}{4} 2>{5}
......
......@@ -442,6 +442,11 @@ class RemoteMachineTrainingService implements TrainingService {
// for lint
return;
}
const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
if (trialJobDetail === undefined) {
throw new Error(`Can not get trial job detail for job: ${trialJobId}`);
}
const trialLocalTempFolder: string = path.join(this.expRootDir, 'trials-local', trialJobId);
await SSHClientUtility.remoteExeCommand(`mkdir -p ${trialWorkingFolder}`, sshClient);
......@@ -463,7 +468,9 @@ class RemoteMachineTrainingService implements TrainingService {
path.join(trialWorkingFolder, 'stderr'),
path.join(trialWorkingFolder, '.nni', 'code'),
/** Mark if the trial is multi-phase job */
this.isMultiPhase);
this.isMultiPhase,
trialJobDetail.sequenceId.toString()
);
//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.join(trialLocalTempFolder, '.nni')}`);
......@@ -475,7 +482,6 @@ class RemoteMachineTrainingService implements TrainingService {
await SSHClientUtility.copyFileToRemote(
path.join(trialLocalTempFolder, 'run.sh'), path.join(trialWorkingFolder, 'run.sh'), sshClient);
await this.writeParameterFile(trialJobId, form.hyperParameters, rmScheduleInfo.rmMeta);
await this.writeSequenceIdFile(trialJobId, rmScheduleInfo.rmMeta);
// Copy files in codeDir to remote working directory
await SSHClientUtility.copyDirectoryToRemote(this.trialConfig.codeDir, trialWorkingFolder, sshClient);
......@@ -614,15 +620,6 @@ class RemoteMachineTrainingService implements TrainingService {
return this.trialSequenceId++;
}
private async writeSequenceIdFile(trialJobId: string, rmMeta: RemoteMachineMeta): Promise<void> {
const trialJobDetail: RemoteMachineTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
if (trialJobDetail === undefined) {
assert(false, `Can not get trial job detail for job: ${trialJobId}`);
} else {
await this.writeRemoteTrialFile(trialJobId, trialJobDetail.sequenceId.toString(), rmMeta, path.join('.nni', 'sequence_id'));
}
}
private async writeRemoteTrialFile(trialJobId: string, fileContent: string,
rmMeta: RemoteMachineMeta, fileName: string): Promise<void> {
const sshClient: Client | undefined = this.machineSSHClientMap.get(rmMeta);
......
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as chai from 'chai';
import * as chaiAsPromised from 'chai-as-promised';
import * as fs from 'fs';
import * as tmp from 'tmp';
import * as component from '../../common/component';
import { cleanupUnitTest, prepareUnitTest } from '../../common/utils';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { KubeflowTrainingService } from '../kubeflow/kubeflowTrainingService';
// TODO: copy mockedTrail.py to local folder
const localCodeDir: string = tmp.dirSync().name
const mockedTrialPath: string = './training_service/test/mockedTrial.py'
fs.copyFileSync(mockedTrialPath, localCodeDir + '/mockedTrial.py')
describe('Unit Test for KubeflowTrainingService', () => {
let skip: boolean = false;
let testKubeflowConfig: any;
let testKubeflowTrialConfig : any;
try {
testKubeflowConfig = JSON.parse(fs.readFileSync('../../.vscode/kubeflowCluster.json', 'utf8'));
testKubeflowTrialConfig = `{\"command\":\"python3 mnist-keras.py\",\"codeDir\":\"/home/desy/nni/examples/trials/mnist",\"gpuNum\":\"1\",\"cpuNum\":\"2\",\"memoryMB\":\"8196\",\"image\":\"msranni/nni:v0.3.3\"}`;
} catch (err) {
console.log('Please configure rminfo.json to enable remote machine unit test.');
//skip = true;
}
let kubeflowTrainingService: KubeflowTrainingService;
console.log(tmp.dirSync().name);
before(() => {
chai.should();
chai.use(chaiAsPromised);
prepareUnitTest();
});
after(() => {
cleanupUnitTest();
});
beforeEach(() => {
if (skip) {
return;
}
kubeflowTrainingService = component.get(KubeflowTrainingService);
});
afterEach(() => {
if (skip) {
return;
}
kubeflowTrainingService.cleanUp();
});
it('Simple Test', async () => {
if (skip) {
return;
}
kubeflowTrainingService.setClusterMetadata(TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG, testKubeflowConfig),
kubeflowTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, testKubeflowTrialConfig);
try {
const trialDetail = await kubeflowTrainingService.submitTrialJob({jobType : 'TRIAL'});
} catch(error) {
console.log('Submit job failed:' + error);
chai.assert(error)
}
});
});
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment