Unverified Commit d3506e34 authored by fishyds, committed by GitHub

PAI Training Service implementation (#128)

* PAI Training Service implementation
1. Implement PAITrainingService
2. Add the trial_keeper Python module (trial_tool) and modify setup.py to install it
3. Add a PAITrainingService REST server to collect metrics from PAI containers
parent cc5372e2
...@@ -92,7 +92,6 @@ build:
#$(_INFO) Building nnictl $(_END)
cd tools && python3 setup.py build
# Standard installation target
# Must be invoked after building
.PHONY: install
...@@ -207,7 +206,6 @@ install-python-modules:
#$(_INFO) Installing nnictl $(_END)
cd tools && python3 setup.py install $(PIP_MODE)
.PHONY: install-node-modules
install-node-modules:
mkdir -p $(INSTALL_PREFIX)/nni
...@@ -227,7 +225,7 @@ install-dev-modules:
#$(_INFO) Installing nnictl $(_END)
cd tools && $(PIP_INSTALL) $(PIP_MODE) -e .
mkdir -p $(INSTALL_PREFIX)/nni
#$(_INFO) Installing NNI Manager $(_END)
......
...@@ -35,7 +35,7 @@ class CustomInstallCommand(install):
setup(
name = 'NNI',
- version = '0.1.0',
+ version = '0.2.0',
author = 'Microsoft NNI Team',
author_email = 'nni@microsoft.com',
description = 'Neural Network Intelligence project',
...@@ -47,7 +47,8 @@ setup(
package_dir = {
'nni_annotation': 'tools/nni_annotation',
'nni': 'src/sdk/pynni/nni',
- 'nnicmd': 'tools/nnicmd'
+ 'nnicmd': 'tools/nnicmd',
+ 'trial_tool':'tools/trial_tool'
},
python_requires = '>=3.5',
install_requires = [
...@@ -59,7 +60,8 @@ setup(
'pyyaml',
'requests',
'scipy',
- 'schema'
+ 'schema',
+ 'pyhdfs'
],
cmdclass={
......
...@@ -225,5 +225,19 @@ function cleanupUnitTest(): void {
Container.restore(ExperimentStartupInfo);
}
- export { getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getDefaultDatabaseDir, mkDirP, delay, prepareUnitTest,
- parseArg, cleanupUnitTest, uniqueString, randomSelect };
+ /**
+ * Get IPv4 address of current machine
+ */
+ function getIPV4Address(): string {
+ let ipv4Address : string = '';
+ for(const item of os.networkInterfaces().eth0) {
+ if(item.family === 'IPv4') {
+ ipv4Address = item.address;
+ }
+ }
+ return ipv4Address;
+ }
+ export { getMsgDispatcherCommand, getLogDir, getExperimentRootDir, getDefaultDatabaseDir, getIPV4Address,
+ mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect };
...@@ -36,6 +36,7 @@ import { LocalTrainingServiceForGPU } from './training_service/local/localTraini
import {
RemoteMachineTrainingService
} from './training_service/remote_machine/remoteMachineTrainingService';
+ import { PAITrainingService } from './training_service/pai/paiTrainingService'
function initStartupInfo(startExpMode: string, resumeExperimentId: string) {
...@@ -49,6 +50,8 @@ async function initContainer(platformMode: string): Promise<void> {
Container.bind(TrainingService).to(LocalTrainingServiceForGPU).scope(Scope.Singleton);
} else if (platformMode === 'remote') {
Container.bind(TrainingService).to(RemoteMachineTrainingService).scope(Scope.Singleton);
+ } else if (platformMode === 'pai'){
+ Container.bind(TrainingService).to(PAITrainingService).scope(Scope.Singleton);
} else {
throw new Error(`Error: unsupported mode: ${mode}`);
}
...@@ -61,7 +64,7 @@ async function initContainer(platformMode: string): Promise<void> {
}
function usage(): void {
- console.info('usage: node main.js --port <port> --mode <local/remote> --start_mode <new/resume> --experiment_id <id>');
+ console.info('usage: node main.js --port <port> --mode <local/remote/pai> --start_mode <new/resume> --experiment_id <id>');
}
let port: number = NNIRestServer.DEFAULT_PORT;
...@@ -71,7 +74,7 @@ if (strPort && strPort.length > 0) {
}
const mode: string = parseArg(['--mode', '-m']);
- if (!['local', 'remote'].includes(mode)) {
+ if (!['local', 'remote', 'pai'].includes(mode)) {
usage();
process.exit(1);
}
......
...@@ -23,7 +23,8 @@
"tree-kill": "^1.2.0",
"ts-deferred": "^1.0.4",
"typescript-ioc": "^1.2.4",
- "typescript-string-operations": "^1.3.1"
+ "typescript-string-operations": "^1.3.1",
+ "webhdfs":"^1.2.0"
},
"devDependencies": {
"@types/chai": "^4.1.4",
...@@ -40,6 +41,7 @@
"chai": "^4.1.2",
"mocha": "^5.2.0",
"request": "^2.87.0",
+ "rmdir": "^1.2.0",
"tmp": "^0.0.33",
"ts-node": "^7.0.0",
"tslint": "^5.11.0",
......
...@@ -33,9 +33,19 @@ export namespace ValidationSchemas {
passphrase: joi.string()
})),
trial_config: joi.object({
- gpuNum: joi.number().min(0).required(),
+ image: joi.string().min(1),
codeDir: joi.string().min(1).required(),
- command: joi.string().min(1).required()
+ dataDir: joi.string(),
+ outputDir: joi.string(),
+ cpuNum: joi.number().min(1),
+ memoryMB: joi.number().min(100),
+ gpuNum: joi.number().min(0).required(),
+ command: joi.string().min(1).required()
+ }),
+ pai_config: joi.object({
+ userName: joi.string().min(1).required(),
+ passWord: joi.string().min(1).required(),
+ host: joi.string().min(1).required()
})
}
};
......
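For reference, a minimal sketch of metadata strings that satisfy the pai_config and trial_config schemas above; all values are hypothetical placeholders, and these are the JSON strings that PAITrainingService.setClusterMetadata() later receives under TrialConfigMetadataKey.PAI_CLUSTER_CONFIG and TrialConfigMetadataKey.TRIAL_CONFIG.
// Hypothetical values; field names follow the pai_config and trial_config schemas above.
const paiClusterConfigJson: string = JSON.stringify({
    userName: 'nni-user',        // PAI cluster user name
    passWord: 'nni-password',    // PAI cluster password
    host: '10.0.0.1'             // PAI REST server host
});
const paiTrialConfigJson: string = JSON.stringify({
    command: 'python3 mnist.py',
    codeDir: '/home/user/nni/examples/trials/mnist',
    gpuNum: 1,
    cpuNum: 1,
    memoryMB: 8196,
    image: 'openpai/pai.example.tensorflow',
    dataDir: '',
    outputDir: ''
});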
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { TrialJobStatus } from '../../common/trainingService';
// tslint:disable-next-line:max-classes-per-file
export class JobMetrics {
public readonly jobId: string;
public readonly metrics: string[];
public readonly jobStatus: TrialJobStatus;
public readonly endTimestamp: number;
constructor(jobId : string, metrics : string[], jobStatus : TrialJobStatus, endTimestamp : number) {
this.jobId = jobId;
this.metrics = metrics;
this.jobStatus = jobStatus;
this.endTimestamp = endTimestamp;
}
}
...@@ -26,5 +26,6 @@ export enum TrialConfigMetadataKey {
MACHINE_LIST = 'machine_list',
TRIAL_CONFIG = 'trial_config',
EXPERIMENT_ID = 'experimentId',
- RANDOM_SCHEDULER = 'random_scheduler'
+ RANDOM_SCHEDULER = 'random_scheduler',
+ PAI_CLUSTER_CONFIG = 'pai_config'
}
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
import * as path from 'path';
import * as fs from 'fs';
import { Deferred } from 'ts-deferred';
import { getLogger } from '../../common/log';
/**
* HDFS client utility, including copy file/directory
*/
export namespace HDFSClientUtility {
/**
* Copy a local file to hdfs directory
*
* @param localFilePath local file path(source)
* @param hdfsFilePath hdfs file path(target)
* @param hdfsClient hdfs client
*/
export async function copyFileToHdfs(localFilePath : string, hdfsFilePath : string, hdfsClient : any) : Promise<void> {
const deferred: Deferred<void> = new Deferred<void>();
fs.exists(localFilePath, (exists : boolean) => {
// Detect if local file exist
if (exists) {
var localFileStream = fs.createReadStream(localFilePath);
var hdfsFileStream = hdfsClient.createWriteStream(hdfsFilePath);
localFileStream.pipe(hdfsFileStream);
hdfsFileStream.on('finish', function onFinish () {
deferred.resolve();
});
hdfsFileStream.on('error', (err : any) => {
getLogger().error(`HDFSClientUtility:copyFileToHdfs, copy file failed, err is ${err.message}`);
deferred.reject(err);
});
} else {
getLogger().error(`HDFSClientUtility:copyFileToHdfs, ${localFilePath} doesn't exist locally`);
deferred.reject('file does not exist!');
}
});
return deferred.promise;
}
/**
* Recursively copy local directory to hdfs directory
*
* @param localDirectory local directory
* @param hdfsDirectory HDFS directory
* @param hdfsClient HDFS client
*/
export async function copyDirectoryToHdfs(localDirectory : string, hdfsDirectory : string, hdfsClient : any) : Promise<void>{
const deferred: Deferred<void> = new Deferred<void>();
// TODO: fs.readdirSync doesn't support ~($HOME)
const fileNameArray: string[] = fs.readdirSync(localDirectory);
for(var fileName of fileNameArray){
const fullFilePath: string = path.join(localDirectory, fileName);
try {
if (fs.lstatSync(fullFilePath).isFile()) {
await copyFileToHdfs(fullFilePath, path.join(hdfsDirectory, fileName), hdfsClient);
} else {
// If filePath is a directory, recursively copy it to the remote directory
await copyDirectoryToHdfs(fullFilePath, path.join(hdfsDirectory, fileName), hdfsClient);
}
} catch(error) {
deferred.reject(error);
}
}
// All files/directories are copied successfully, resolve
deferred.resolve();
return deferred.promise;
}
/**
* Read content from HDFS file
*
* @param hdfsPath HDFS file path
* @param hdfsClient HDFS client
*/
export async function readFileFromHDFS(hdfsPath : string, hdfsClient :any) : Promise<Buffer> {
const deferred: Deferred<Buffer> = new Deferred<Buffer>();
let buffer : Buffer = Buffer.alloc(0);
const exist : boolean = await pathExists(hdfsPath, hdfsClient);
if(!exist) {
deferred.reject(`${hdfsPath} doesn't exist`);
}
const remoteFileStream = hdfsClient.createReadStream(hdfsPath);
remoteFileStream.on('error', (err : any) => {
// Reject with the error
deferred.reject(err);
});
remoteFileStream.on('data', (chunk : any) => {
// Concat the data chunk to buffer
buffer = Buffer.concat([buffer, chunk]);
});
remoteFileStream.on('finish', function onFinish () {
// Read is complete, resolve with the accumulated buffer
deferred.resolve(buffer);
});
return deferred.promise;
}
/**
* Check if an HDFS path already exists
*
* @param hdfsPath target path need to check in HDFS
* @param hdfsClient HDFS client
*/
export async function pathExists(hdfsPath : string, hdfsClient : any) : Promise<boolean> {
const deferred : Deferred<boolean> = new Deferred<boolean>();
hdfsClient.exists(hdfsPath, (exist : boolean ) => {
deferred.resolve(exist);
})
return deferred.promise;
}
/**
* Mkdir in HDFS, use default permission 755
*
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient
*/
export function mkdir(hdfsPath : string, hdfsClient : any) : Promise<boolean> {
const deferred : Deferred<boolean> = new Deferred<boolean>();
hdfsClient.mkdir(hdfsPath, (err : any)=> {
if(!err) {
deferred.resolve(true);
} else {
deferred.reject(err.message);
}
});
return deferred.promise;
}
/**
* Read directory contents
*
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient
*/
export async function readdir(hdfsPath : string, hdfsClient : any) : Promise<string[]> {
const deferred : Deferred<string[]> = new Deferred<string[]>();
const exist : boolean = await pathExists(hdfsPath, hdfsClient);
if(!exist) {
deferred.reject(`${hdfsPath} doesn't exist`);
}
hdfsClient.readdir(hdfsPath, (err : any, files : any[] ) => {
if(err) {
deferred.reject(err);
}
deferred.resolve(files);
});
return deferred.promise;
}
/**
* Delete HDFS path
* @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient
* @param recursive Mark if need to delete recursively
*/
export function deletePath(hdfsPath : string, hdfsClient : any, recursive : boolean = true) : Promise<boolean> {
const deferred : Deferred<boolean> = new Deferred<boolean>();
hdfsClient.unlink(hdfsPath, recursive, (err : any)=> {
if(!err) {
deferred.resolve(true);
} else {
deferred.reject(err.message);
}
});
return deferred.promise;
}
}
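Below is a minimal usage sketch of the utility above, assuming a reachable WebHDFS endpoint; the host, user name, and paths are hypothetical placeholders, and the client is created the same way paiTrainingService.ts does.
// Hypothetical wiring; not part of the change set, just an illustration of the API above.
import { HDFSClientUtility } from './hdfsClientUtility';
var WebHDFS = require('webhdfs');

async function uploadCodeExample(): Promise<void> {
    const hdfsClient: any = WebHDFS.createClient({ user: 'nni-user', port: 50070, host: '10.0.0.1' });
    await HDFSClientUtility.mkdir('/nni/experiments/exp123', hdfsClient);
    // Recursively copy a local code directory into the experiment folder on HDFS
    await HDFSClientUtility.copyDirectoryToHdfs('/tmp/trial-code', '/nni/experiments/exp123/code', hdfsClient);
    const exists: boolean = await HDFSClientUtility.pathExists('/nni/experiments/exp123/code', hdfsClient);
    console.log(`Code directory exists on HDFS: ${exists}`);
}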
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import {TrialConfig} from '../common/trialConfig'
export class PAITaskRole {
// Name for the task role
public readonly name: string;
// Number of tasks for the task role, no less than 1
public readonly taskNumber: number;
// CPU number for one task in the task role, no less than 1
public readonly cpuNumber: number;
// Memory for one task in the task role, no less than 100
public readonly memoryMB: number;
// GPU number for one task in the task role, no less than 0
public readonly gpuNumber: number;
// Executable command for tasks in the task role, can not be empty
public readonly command: string;
/**
* Constructor
* @param name Name for the task role
* @param taskNumber Number of tasks for the task role, no less than 1
* @param cpuNumber CPU number for one task in the task role, no less than 1
* @param memoryMB Memory for one task in the task role, no less than 100
* @param gpuNumber GPU number for one task in the task role, no less than 0
* @param command Executable command for tasks in the task role, can not be empty
*/
constructor(name : string, taskNumber : number, cpuNumber : number, memoryMB : number, gpuNumber : number, command : string) {
this.name = name;
this.taskNumber = taskNumber;
this.cpuNumber = cpuNumber;
this.memoryMB = memoryMB;
this.gpuNumber = gpuNumber;
this.command = command;
}
}
export class PAIJobConfig{
// Name for the job, needs to be unique
public readonly jobName: string;
// URL pointing to the Docker image for all tasks in the job
public readonly image: string;
// Data directory existing on HDFS
public readonly dataDir: string;
// Output directory on HDFS
public readonly outputDir: string;
// Code directory on HDFS
public readonly codeDir: string;
// List of taskRole, one task role at least
public taskRoles: PAITaskRole[];
/**
* Constructor
* @param jobName Name for the job, needs to be unique
* @param image URL pointing to the Docker image for all tasks in the job
* @param dataDir Data directory existing on HDFS
* @param outputDir Output directory on HDFS
* @param taskRoles List of taskRole, one task role at least
*/
constructor(jobName: string, image : string, dataDir : string, outputDir : string, codeDir : string, taskRoles : PAITaskRole[]){
this.jobName = jobName;
this.image = image;
this.dataDir = dataDir;
this.outputDir = outputDir;
this.codeDir = codeDir;
this.taskRoles = taskRoles;
}
}
export class PAIClusterConfig {
public readonly userName: string;
public readonly passWord: string;
public readonly host: string;
/**
* Constructor
* @param userName User name of PAI Cluster
* @param passWord password of PAI Cluster
* @param host Host IP of PAI Cluster
*/
constructor(userName: string, passWord : string, host : string){
this.userName = userName;
this.passWord = passWord;
this.host = host;
}
}
export class NNIPAITrialConfig extends TrialConfig{
public readonly cpuNum: number;
public readonly memoryMB: number;
public readonly image: string;
public readonly dataDir: string;
public outputDir: string;
constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, image: string, dataDir: string, outputDir: string) {
super(command, codeDir, gpuNum);
this.cpuNum = cpuNum;
this.memoryMB = memoryMB;
this.image = image;
this.dataDir = dataDir;
this.outputDir = outputDir;
}
}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
export class PAITrialJobDetail implements TrialJobDetail {
public id: string;
public status: TrialJobStatus;
public paiJobName: string;
public submitTime: number;
public startTime?: number;
public endTime?: number;
public tags?: string[];
public url?: string;
public workingDirectory: string;
public form: JobApplicationForm;
public hdfsLogPath: string;
constructor(id: string, status: TrialJobStatus, paiJobName : string,
submitTime: number, workingDirectory: string, form: JobApplicationForm, hdfsLogPath: string) {
this.id = id;
this.status = status;
this.paiJobName = paiJobName;
this.submitTime = submitTime;
this.workingDirectory = workingDirectory;
this.form = form;
this.tags = [];
this.hdfsLogPath = hdfsLogPath;
}
}
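// Placeholders in the command template below are filled via String.Format in paiTrainingService.submitTrialJob:
// {0} container working directory (NNI_SYS_DIR / NNI_OUTPUT_DIR), {1} trial job id, {2} experiment id,
// {3} trial command, {4} NNI manager IP, {5} HDFS output directory, {6} HDFS host, {7} PAI user name.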
export const PAI_TRIAL_COMMAND_FORMAT: string =
`pip3 install -v --user git+https://github.com/Microsoft/nni.git@master
&& export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_EXP_ID={2}
&& cd $NNI_SYS_DIR && mkdir .nni
&& python3 -m trial_tool.trial_keeper --trial_command '{3}' --nnimanager_ip '{4}' --pai_hdfs_output_dir '{5}'
--pai_hdfs_host '{6}' --pai_user_name {7}`;
export const PAI_OUTPUT_DIR_FORMAT: string =
`hdfs://{0}:9000/`;
export const PAI_LOG_PATH_FORMAT: string =
`http://{0}:50070/explorer.html#{1}`
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as request from 'request';
import { EventEmitter } from 'events';
import { Deferred } from 'ts-deferred';
import { getLogger, Logger } from '../../common/log';
import { NNIError, NNIErrorNames } from '../../common/errors';
import { PAITrialJobDetail } from './paiData';
import { PAIClusterConfig } from './paiConfig';
import { TrialJobStatus } from '../../common/trainingService';
/**
* Collects PAI job info from the PAI cluster and updates PAI job status locally
*/
export class PAIJobInfoCollector {
private readonly trialJobsMap : Map<string, PAITrialJobDetail>;
private readonly log: Logger = getLogger();
private readonly statusesNeedToCheck : TrialJobStatus[];
private readonly finalStatuses : TrialJobStatus[];
constructor(jobMap: Map<string, PAITrialJobDetail>) {
this.trialJobsMap = jobMap;
this.statusesNeedToCheck = ['RUNNING', 'UNKNOWN', 'WAITING'];
this.finalStatuses = ['SUCCEEDED', 'FAILED', 'USER_CANCELED', 'SYS_CANCELED'];
}
public async updateTrialStatusFromPAI(paiToken? : string, paiClusterConfig?: PAIClusterConfig) : Promise<void> {
if (!paiClusterConfig || !paiToken) {
return Promise.resolve();
}
const updatePaiTrialJobs : Promise<void>[] = [];
for(let [trialJobId, paiTrialJob] of this.trialJobsMap) {
if (!paiTrialJob) {
throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`);
}
updatePaiTrialJobs.push(this.getSinglePAITrialJobInfo(paiTrialJob, paiToken, paiClusterConfig))
}
await Promise.all(updatePaiTrialJobs);
}
private getSinglePAITrialJobInfo(paiTrialJob : PAITrialJobDetail, paiToken : string, paiClusterConfig: PAIClusterConfig) : Promise<void> {
const deferred : Deferred<void> = new Deferred<void>();
if (!this.statusesNeedToCheck.includes(paiTrialJob.status)) {
deferred.resolve();
return deferred.promise;
}
// Rest call to get PAI job info and update status
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
const getJobInfoRequest: request.Options = {
uri: `http://${paiClusterConfig.host}:9186/api/v1/jobs/${paiTrialJob.paiJobName}`,
method: 'GET',
json: true,
headers: {
"Content-Type": "application/json",
"Authorization": 'Bearer ' + paiToken
}
};
//TODO : pass in request timeout param?
request(getJobInfoRequest, (error: Error, response: request.Response, body: any) => {
if (error || response.statusCode >= 500) {
this.log.error(`PAI Training service: get job info for trial ${paiTrialJob.id} from PAI Cluster failed!`);
// Query for PAI job info failed, so set job status to UNKNOWN
if(paiTrialJob.status === 'WAITING' || paiTrialJob.status === 'RUNNING') {
paiTrialJob.status = 'UNKNOWN';
}
} else {
if(response.body.jobStatus && response.body.jobStatus.state) {
switch(response.body.jobStatus.state) {
case 'WAITING':
paiTrialJob.status = 'WAITING';
break;
case 'RUNNING':
paiTrialJob.status = 'RUNNING';
if(!paiTrialJob.startTime) {
paiTrialJob.startTime = response.body.jobStatus.appLaunchedTime;
}
if(!paiTrialJob.url) {
paiTrialJob.url = response.body.jobStatus.appTrackingUrl;
}
break;
case 'SUCCEEDED':
paiTrialJob.status = 'SUCCEEDED';
break;
case 'STOPPED':
paiTrialJob.status = 'USER_CANCELED';
break;
case 'FAILED':
paiTrialJob.status = 'FAILED';
break;
default:
paiTrialJob.status = 'UNKNOWN';
break;
}
// For final job statuses, update startTime, endTime and url
if(this.finalStatuses.includes(paiTrialJob.status)) {
if(!paiTrialJob.startTime) {
paiTrialJob.startTime = response.body.jobStatus.appLaunchedTime;
}
if(!paiTrialJob.endTime) {
paiTrialJob.endTime = response.body.jobStatus.completedTime;
}
// Set pai trial job's url to WebHDFS output path
if(paiTrialJob.hdfsLogPath) {
paiTrialJob.url = paiTrialJob.hdfsLogPath;
}
}
}
}
deferred.resolve();
});
return deferred.promise;
}
}
\ No newline at end of file
...@@ -17,4 +17,82 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
\ No newline at end of file
import { Request, Response, Router } from 'express';
import * as bodyParser from 'body-parser';
import * as component from '../../common/component';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { Inject } from 'typescript-ioc';
import { PAITrainingService } from './paiTrainingService';
import { RestServer } from '../../common/restServer'
/**
* PAI training service REST server; provides a REST API to support PAI job metrics updates
*
*/
@component.Singleton
export class PAIJobRestServer extends RestServer{
/** NNI main rest service default port */
private static readonly DEFAULT_PORT: number = 51189;
private readonly API_ROOT_URL: string = '/api/v1/nni-pai';
private readonly expId: string = getExperimentId();
@Inject
private readonly paiTrainingService : PAITrainingService;
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
constructor() {
super();
this.port = PAIJobRestServer.DEFAULT_PORT;
this.paiTrainingService = component.get(PAITrainingService);
}
/**
* NNIRestServer's own router registration
*/
protected registerRestHandler(): void {
this.app.use(bodyParser.json());
this.app.use(this.API_ROOT_URL, this.createRestHandler());
}
private createRestHandler() : Router {
const router: Router = Router();
// tslint:disable-next-line:typedef
router.use((req: Request, res: Response, next) => {
this.log.info(`${req.method}: ${req.url}: body:\n${JSON.stringify(req.body, undefined, 4)}`);
res.setHeader('Content-Type', 'application/json');
next();
});
router.post(`/update-metrics/${this.expId}/:trialId`, (req: Request, res: Response) => {
try {
this.log.info(`Received update-metrics request, trial job id is ${req.params.trialId}`);
this.log.info(`update-metrics body is ${JSON.stringify(req.body)}`);
// Split metrics array into single metric, then emit
// Warning: If not split metrics into single ones, the behavior will be UNKNOWN
for (const singleMetric of req.body.metrics) {
this.paiTrainingService.MetricsEmitter.emit('metric', {
id : req.body.jobId,
data : singleMetric
});
}
res.send();
}
catch(err) {
this.log.error(`json parse metrics error: ${err}`);
res.status(500);
res.send(err.message);
}
});
return router;
}
}
\ No newline at end of file
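For clarity, a sketch of the request body the update-metrics handler above expects, inferred from its use of req.body.jobId and req.body.metrics; the values are hypothetical and each metrics entry is an opaque string produced by the trial keeper.
// Hypothetical payload for POST /api/v1/nni-pai/update-metrics/<experimentId>/<trialId>
const exampleMetricsUpdate = {
    jobId: 'Ab3dE',                        // trial job id assigned by PAITrainingService
    metrics: [
        '<serialized metric string #1>',   // each entry is re-emitted separately as a 'metric' event
        '<serialized metric string #2>'
    ]
};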
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict'
import * as component from '../../common/component';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import * as path from 'path';
import * as request from 'request';
import { Deferred } from 'ts-deferred';
import { EventEmitter } from 'events';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { HDFSClientUtility } from './hdfsClientUtility'
import { MethodNotImplementedError } from '../../common/errors';
import { getLogger, Logger } from '../../common/log';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import {
JobApplicationForm, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric
} from '../../common/trainingService';
import { delay, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils';
import { PAIJobRestServer } from './paiJobRestServer'
import { PAITrialJobDetail, PAI_TRIAL_COMMAND_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_LOG_PATH_FORMAT } from './paiData';
import { PAIJobInfoCollector } from './paiJobInfoCollector';
import { String } from 'typescript-string-operations';
import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig';
var WebHDFS = require('webhdfs');
/**
* Training Service implementation for OpenPAI (Open Platform for AI)
* Refer https://github.com/Microsoft/pai for more info about OpenPAI
*/
@component.Singleton
class PAITrainingService implements TrainingService {
private readonly log!: Logger;
private readonly metricsEmitter: EventEmitter;
private readonly trialJobsMap: Map<string, PAITrialJobDetail>;
private readonly expRootDir: string;
private paiTrialConfig: NNIPAITrialConfig | undefined;
private paiClusterConfig?: PAIClusterConfig;
private stopping: boolean = false;
private hdfsClient: any;
private paiToken? : string;
private experimentId! : string;
private readonly paiJobCollector : PAIJobInfoCollector;
private readonly hdfsDirPattern: string;
constructor() {
this.log = getLogger();
this.metricsEmitter = new EventEmitter();
this.trialJobsMap = new Map<string, PAITrialJobDetail>();
// Root dir on HDFS
this.expRootDir = path.join('/nni', 'experiments', getExperimentId());
this.experimentId = getExperimentId();
this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap);
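// Matches an outputDir such as hdfs://10.0.0.1:9000/path, capturing named groups 'host' and 'baseDir' (used in submitTrialJob)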
this.hdfsDirPattern = 'hdfs://(?<host>([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?<baseDir>/.*)?';
}
public async run(): Promise<void> {
const restServer: PAIJobRestServer = component.get(PAIJobRestServer);
await restServer.start();
this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`);
while (!this.stopping) {
await this.paiJobCollector.updateTrialStatusFromPAI(this.paiToken, this.paiClusterConfig);
await delay(3000);
}
}
public async listTrialJobs(): Promise<TrialJobDetail[]> {
const jobs: TrialJobDetail[] = [];
this.trialJobsMap.forEach(async (value: PAITrialJobDetail, key: string) => {
if (value.form.jobType === 'TRIAL') {
jobs.push(await this.getTrialJob(key));
}
});
return Promise.resolve(jobs);
}
public getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
if(!this.paiClusterConfig) {
throw new Error('PAI Cluster config is not initialized');
}
const paiTrialJob: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
if (!paiTrialJob) {
return Promise.reject(`trial job ${trialJobId} not found`)
}
return Promise.resolve(paiTrialJob);
}
public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void) {
this.metricsEmitter.on('metric', listener);
}
public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void) {
this.metricsEmitter.off('metric', listener);
}
public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> {
const deferred : Deferred<PAITrialJobDetail> = new Deferred<PAITrialJobDetail>();
if(!this.paiClusterConfig) {
throw new Error('PAI Cluster config is not initialized');
}
if (!this.paiTrialConfig) {
throw new Error('trial config is not initialized');
}
if (!this.paiToken) {
throw new Error('PAI token is not initialized');
}
this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`);
const trialJobId: string = uniqueString(5);
//TODO: use HDFS working folder instead
const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId);
const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
//create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${this.paiTrialConfig.codeDir} ${trialLocalTempFolder}`);
// Write file content ( parameter.cfg ) to local tmp folders
const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form)
if(trialForm) {
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'parameter.cfg'), trialForm.hyperParameters, { encoding: 'utf8' });
}
// Step 1. Prepare PAI job configuration
const paiJobName : string = `nni_exp_${this.experimentId}_trial_${trialJobId}`;
const hdfsCodeDir : string = path.join(this.expRootDir, trialJobId);
const hdfsDirContent = this.paiTrialConfig.outputDir.match(this.hdfsDirPattern);
if(hdfsDirContent === null) {
throw new Error('Trial outputDir format Error');
}
const groups = hdfsDirContent.groups;
if(groups === undefined) {
throw new Error('Trial outputDir format Error');
}
const hdfsHost = groups['host'];
let hdfsBaseDirectory = groups['baseDir'];
if(hdfsBaseDirectory === undefined) {
hdfsBaseDirectory = "/";
}
const hdfsOutputDir : string = path.join(hdfsBaseDirectory, this.experimentId, trialJobId);
const hdfsLogPath : string = String.Format(
PAI_LOG_PATH_FORMAT,
hdfsHost,
hdfsOutputDir);
const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail(
trialJobId,
'WAITING',
paiJobName,
Date.now(),
trialWorkingFolder,
form,
hdfsLogPath);
this.trialJobsMap.set(trialJobId, trialJobDetail);
const nniPaiTrialCommand : string = String.Format(
PAI_TRIAL_COMMAND_FORMAT,
// PAI will copy job's codeDir into /root directory
`/root/${trialJobId}`,
trialJobId,
this.experimentId,
this.paiTrialConfig.command,
getIPV4Address(),
hdfsOutputDir,
hdfsHost,
this.paiClusterConfig.userName
).replace(/\r\n|\n|\r/gm, '');
console.log(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`);
const paiTaskRoles : PAITaskRole[] = [new PAITaskRole('nni_trail_' + trialJobId,
// Task role number
1,
// Task CPU number
this.paiTrialConfig.cpuNum,
// Task memory
this.paiTrialConfig.memoryMB,
// Task GPU number
this.paiTrialConfig.gpuNum,
// Task command
nniPaiTrialCommand)];
const paiJobConfig : PAIJobConfig = new PAIJobConfig(
// Job name
paiJobName,
// Docker image
this.paiTrialConfig.image,
// dataDir
this.paiTrialConfig.dataDir,
// outputDir
this.paiTrialConfig.outputDir,
// codeDir
`$PAI_DEFAULT_FS_URI${hdfsCodeDir}`,
// TODO: Add Virtual Cluster
// PAI Task roles
paiTaskRoles);
// Step 2. Upload code files in codeDir onto HDFS
try {
await HDFSClientUtility.copyDirectoryToHdfs(trialLocalTempFolder, hdfsCodeDir, this.hdfsClient);
} catch (error) {
this.log.error(`PAI Training service: copy ${this.paiTrialConfig.codeDir} to HDFS ${hdfsCodeDir} failed, error is ${error}`);
throw new Error(error.message);
}
// Step 3. Submit PAI job via Rest call
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
const submitJobRequest: request.Options = {
uri: `http://${this.paiClusterConfig.host}:9186/api/v1/jobs`,
method: 'POST',
json: true,
body: paiJobConfig,
headers: {
"Content-Type": "application/json",
"Authorization": 'Bearer ' + this.paiToken
}
};
request(submitJobRequest, (error: Error, response: request.Response, body: any) => {
if (error || response.statusCode >= 400) {
this.log.error(`PAI Training service: Submit trial ${trialJobId} to PAI Cluster failed!`);
trialJobDetail.status = 'FAILED';
deferred.reject(error ? error.message : 'Submit trial failed, http code: ' + response.statusCode);
} else {
trialJobDetail.submitTime = Date.now();
deferred.resolve(trialJobDetail);
}
});
return deferred.promise;
}
public updateTrialJob(trialJobId: string, form: JobApplicationForm): Promise<TrialJobDetail> {
throw new MethodNotImplementedError();
}
public get isMultiPhaseJobSupported(): boolean {
return false;
}
public cancelTrialJob(trialJobId: string): Promise<void> {
const trialJobDetail : PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
const deferred : Deferred<void> = new Deferred<void>();
if(!trialJobDetail) {
this.log.error(`cancelTrialJob: trial job id ${trialJobId} not found`);
return Promise.reject();
}
if(!this.paiClusterConfig) {
throw new Error('PAI Cluster config is not initialized');
}
if (!this.paiToken) {
throw new Error('PAI token is not initialized');
}
const stopJobRequest: request.Options = {
uri: `http://${this.paiClusterConfig.host}:9186/api/v1/jobs/${trialJobDetail.paiJobName}/executionType`,
method: 'PUT',
json: true,
body: {'value' : 'STOP'},
headers: {
"Content-Type": "application/json",
"Authorization": 'Bearer ' + this.paiToken
}
};
request(stopJobRequest, (error: Error, response: request.Response, body: any) => {
if (error || response.statusCode >= 400) {
this.log.error(`PAI Training service: stop trial ${trialJobId} on PAI Cluster failed!`);
deferred.reject(error ? error.message : 'Stop trial failed, http code: ' + response.statusCode);
} else {
deferred.resolve();
}
});
return deferred.promise;
}
public setClusterMetadata(key: string, value: string): Promise<void> {
const deferred : Deferred<void> = new Deferred<void>();
switch (key) {
case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG:
//TODO: try catch exception when setting up HDFS client and get PAI token
this.paiClusterConfig = <PAIClusterConfig>JSON.parse(value);
this.hdfsClient = WebHDFS.createClient({
user: this.paiClusterConfig.userName,
port: 50070,
host: this.paiClusterConfig.host
});
// Get PAI authentication token
const authentication_req: request.Options = {
uri: `http://${this.paiClusterConfig.host}:9186/api/v1/token`,
method: 'POST',
json: true,
body: {
username: this.paiClusterConfig.userName,
password: this.paiClusterConfig.passWord
}
};
request(authentication_req, (error: Error, response: request.Response, body: any) => {
if (error) {
//TODO: should we make setClusterMetadata's return type Promise<string>?
this.log.error(`Get PAI token failed: ${error.message}`);
deferred.reject();
} else {
if(response.statusCode !== 200){
this.log.error(`Get PAI token failed: get PAI Rest return code ${response.statusCode}`);
deferred.reject();
}
this.paiToken = body.token;
deferred.resolve();
}
});
break;
case TrialConfigMetadataKey.TRIAL_CONFIG:
if (!this.paiClusterConfig){
this.log.error('pai cluster config is not initialized');
deferred.reject();
break;
}
this.paiTrialConfig = <NNIPAITrialConfig>JSON.parse(value);
//paiTrialConfig.outputDir could be null if it is not set in nnictl
if(this.paiTrialConfig.outputDir === undefined || this.paiTrialConfig.outputDir === null){
this.paiTrialConfig.outputDir = String.Format(
PAI_OUTPUT_DIR_FORMAT,
this.paiClusterConfig.host
).replace(/\r\n|\n|\r/gm, '');
}
deferred.resolve();
break;
default:
//Reject for unknown keys
throw new Error(`Unknown key: ${key}`);
}
return deferred.promise;
}
public getClusterMetadata(key: string): Promise<string> {
const deferred : Deferred<string> = new Deferred<string>();
deferred.resolve();
return deferred.promise;
}
public async cleanUp(): Promise<void> {
this.stopping = true;
const deferred : Deferred<void> = new Deferred<void>();
const restServer: PAIJobRestServer = component.get(PAIJobRestServer);
try {
await restServer.stop();
deferred.resolve();
this.log.info('PAI Training service rest server stopped successfully.');
} catch (error) {
this.log.error(`PAI Training service rest server stop failed, error: ${error.message}`);
deferred.reject(error);
}
return deferred.promise;
}
public get MetricsEmitter() : EventEmitter {
return this.metricsEmitter;
}
}
export { PAITrainingService }
\ No newline at end of file
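A minimal lifecycle sketch for the service above; the configuration strings are hypothetical, and the flow mirrors what the NNI manager and the unit test below do with the public API.
// Hypothetical driver code; paiClusterJson/paiTrialJson follow the schemas in restValidationSchemas.ts.
import * as component from '../../common/component';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { PAITrainingService } from './paiTrainingService';

async function paiLifecycleExample(paiClusterJson: string, paiTrialJson: string): Promise<void> {
    const service: PAITrainingService = component.get(PAITrainingService);
    await service.setClusterMetadata(TrialConfigMetadataKey.PAI_CLUSTER_CONFIG, paiClusterJson);
    await service.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, paiTrialJson);
    service.run();                                                   // starts PAIJobRestServer and the PAI status poller
    const job = await service.submitTrialJob({ jobType: 'TRIAL' });  // submits one trial to the PAI cluster
    await service.cancelTrialJob(job.id);                            // stops the PAI job via its executionType API
    await service.cleanUp();                                         // stops the rest server and the polling loop
}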
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import {TrialConfig} from '../common/trialConfig'
export class PAITrialConfig extends TrialConfig{
public readonly cpuNum: number;
public readonly memoryMB: number;
public readonly image: string;
public readonly dataDir: string;
public readonly outputDir: string;
constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, image: string, dataDir: string, outputDir: string) {
super(command, codeDir, gpuNum);
this.cpuNum = cpuNum;
this.memoryMB = memoryMB;
this.image = image;
this.dataDir = dataDir;
this.outputDir = outputDir;
}
}
\ No newline at end of file
...@@ -25,7 +25,8 @@ import * as path from 'path';
import { Client } from 'ssh2';
import { getLogger, Logger } from '../../common/log';
import { TrialJobStatus, TrialJobDetail } from '../../common/trainingService';
- import { JobMetrics, RemoteCommandResult, RemoteMachineMeta, RemoteMachineTrialJobDetail } from './remoteMachineData';
+ import { JobMetrics } from '../common/jobMetrics';
+ import { RemoteCommandResult, RemoteMachineMeta, RemoteMachineTrialJobDetail } from './remoteMachineData';
import { SSHClientUtility } from './sshClientUtility';
export class MetricsCollector {
......
...@@ -65,21 +65,6 @@ export class RemoteCommandResult {
}
}
- // tslint:disable-next-line:max-classes-per-file
- export class JobMetrics {
- public readonly jobId: string;
- public readonly metrics: string[];
- public readonly jobStatus: TrialJobStatus;
- public readonly endTimestamp: number;
- constructor(jobId : string, metrics : string[], jobStatus : TrialJobStatus, endTimestamp : number) {
- this.jobId = jobId;
- this.metrics = metrics;
- this.jobStatus = jobStatus;
- this.endTimestamp = endTimestamp;
- }
- }
/**
* RemoteMachineTrialJobDetail
*/
...@@ -121,7 +106,7 @@ export enum ScheduleResultType {
REQUIRE_EXCEED_TOTAL
}
- export const REMOTEMACHINERUNSHELLFORMAT: string =
+ export const REMOTEMACHINE_RUN_SHELL_FORMAT: string =
`#!/bin/bash
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_TRIAL_JOB_ID={1} NNI_OUTPUT_DIR={0}
cd $NNI_SYS_DIR
...@@ -129,7 +114,7 @@ echo $$ >{2}
eval {3}{4} 2>{5}
echo $? \`date +%s%3N\` >{6}`;
- export const HOSTJOBSHELLFORMAT: string =
+ export const HOST_JOB_SHELL_FORMAT: string =
`#!/bin/bash
cd {0}
echo $$ >{1}
......
...@@ -43,8 +43,8 @@ import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { GPUScheduler } from './gpuScheduler';
import { MetricsCollector } from './metricsCollector';
import {
- HOSTJOBSHELLFORMAT, RemoteCommandResult, RemoteMachineMeta,
- REMOTEMACHINERUNSHELLFORMAT, RemoteMachineScheduleInfo, RemoteMachineScheduleResult,
+ HOST_JOB_SHELL_FORMAT, RemoteCommandResult, RemoteMachineMeta,
+ REMOTEMACHINE_RUN_SHELL_FORMAT, RemoteMachineScheduleInfo, RemoteMachineScheduleResult,
RemoteMachineTrialJobDetail, ScheduleResultType
} from './remoteMachineData';
import { SSHClientUtility } from './sshClientUtility';
...@@ -427,7 +427,7 @@ class RemoteMachineTrainingService implements TrainingService {
// RemoteMachineRunShellFormat is the run shell format string,
// See definition in remoteMachineData.ts
const runScriptContent: string = String.Format(
- REMOTEMACHINERUNSHELLFORMAT,
+ REMOTEMACHINE_RUN_SHELL_FORMAT,
trialWorkingFolder,
trialJobId,
path.join(trialWorkingFolder, '.nni', 'jobpid'),
...@@ -470,7 +470,7 @@ class RemoteMachineTrainingService implements TrainingService {
await cpp.exec(`mkdir -p ${localDir}`);
await SSHClientUtility.remoteExeCommand(`mkdir -p ${remoteDir}`, sshClient);
const runScriptContent: string = String.Format(
- HOSTJOBSHELLFORMAT, remoteDir, path.join(remoteDir, 'jobpid'), form.cmd, path.join(remoteDir, 'code')
+ HOST_JOB_SHELL_FORMAT, remoteDir, path.join(remoteDir, 'jobpid'), form.cmd, path.join(remoteDir, 'code')
);
await fs.promises.writeFile(path.join(localDir, 'run.sh'), runScriptContent, { encoding: 'utf8' });
await SSHClientUtility.copyFileToRemote(
......
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as chai from 'chai';
import * as chaiAsPromised from 'chai-as-promised';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
import * as tmp from 'tmp';
import { cleanupUnitTest, prepareUnitTest, uniqueString } from '../../common/utils';
import { HDFSClientUtility } from '../pai/hdfsClientUtility';
var WebHDFS = require('webhdfs');
var rmdir = require('rmdir');
describe('WebHDFS', function () {
/*
To enable web HDFS client unit test, HDFS information needs to be configured in:
Default/.vscode/hdfsInfo.json, whose content looks like:
{
"user": "user1",
"port": 50070,
"host": "10.0.0.0"
}
*/
let skip: boolean = false;
let testHDFSInfo: any;
let hdfsClient: any;
try {
testHDFSInfo = JSON.parse(fs.readFileSync('../../.vscode/hdfsInfo.json', 'utf8'));
console.log(testHDFSInfo);
hdfsClient = WebHDFS.createClient({
user: testHDFSInfo.user,
port: testHDFSInfo.port,
host: testHDFSInfo.host
});
} catch (err) {
console.log('Please configure hdfsInfo.json to enable the WebHDFS unit test.');
skip = true;
}
before(() => {
chai.should();
chai.use(chaiAsPromised);
tmp.setGracefulCleanup();
prepareUnitTest();
});
after(() => {
cleanupUnitTest();
});
it('Test HDFS utility path functions', async () => {
if (skip) {
return;
}
const testPath : string = '/nni_unittest_' + uniqueString(6);
let exists : boolean = await HDFSClientUtility.pathExists(testPath, hdfsClient);
// The new random named path is expected to not exist
chai.expect(exists).to.be.equals(false);
const mkdirResult : boolean = await HDFSClientUtility.mkdir(testPath, hdfsClient);
// Mkdir is expected to be successful
chai.expect(mkdirResult).to.be.equals(true);
exists = await HDFSClientUtility.pathExists(testPath, hdfsClient);
// The newly created path is expected to exist
chai.expect(exists).to.be.equals(true);
const deleteResult : boolean = await HDFSClientUtility.deletePath(testPath, hdfsClient);
// Delete path is expected to be successful
chai.expect(deleteResult).to.be.equals(true);
exists = await HDFSClientUtility.pathExists(testPath, hdfsClient);
// The deleted path is not expected to exist
chai.expect(exists).to.be.equals(false);
});
it('Test HDFS utility copyFileToHdfs', async() => {
if (skip) {
return;
}
// Prepare local directory and files
const tmpLocalDirectoryPath : string = path.join(os.tmpdir(), 'nni_unittest_dir_' + uniqueString(6));
const tmpDataFilePath : string = path.join(tmpLocalDirectoryPath, 'file_' + uniqueString(6));
const testFileData : string = 'TestContent123';
fs.mkdirSync(tmpLocalDirectoryPath);
fs.writeFileSync(tmpDataFilePath, testFileData);
const testHDFSFilePath : string = '/nni_unittest_' + uniqueString(6);
let exists : boolean = await HDFSClientUtility.pathExists(testHDFSFilePath, hdfsClient);
// The new random named path is expected to not exist
chai.expect(exists).to.be.equals(false);
await HDFSClientUtility.copyFileToHdfs(tmpDataFilePath, testHDFSFilePath, hdfsClient);
exists = await HDFSClientUtility.pathExists(testHDFSFilePath, hdfsClient);
// After copy local file to HDFS, the target file path in HDFS is expected to exist
chai.expect(exists).to.be.equals(true);
const buffer : Buffer = await HDFSClientUtility.readFileFromHDFS(testHDFSFilePath, hdfsClient);
const actualFileData : string = buffer.toString('utf8');
// The file content read from HDFS is expected to equal to the content of local file
chai.expect(actualFileData).to.be.equals(testFileData);
const testHDFSDirPath : string = path.join('/nni_unittest_' + uniqueString(6) + '_dir');
await HDFSClientUtility.copyDirectoryToHdfs(tmpLocalDirectoryPath, testHDFSDirPath, hdfsClient);
const files : any[] = await HDFSClientUtility.readdir(testHDFSDirPath, hdfsClient);
// Expected file count under HDFS target directory is 1
chai.expect(files.length).to.be.equals(1);
// Expected file name under HDFS target directory is equal to local file name
chai.expect(files[0].pathSuffix).to.be.equals(path.parse(tmpDataFilePath).base);
// Cleanup
rmdir(tmpLocalDirectoryPath);
let deleteResult : boolean = await HDFSClientUtility.deletePath(testHDFSFilePath, hdfsClient);
chai.expect(deleteResult).to.be.equals(true);
deleteResult = await HDFSClientUtility.deletePath(testHDFSDirPath, hdfsClient);
chai.expect(deleteResult).to.be.equals(true);
});
});
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as chai from 'chai';
import * as chaiAsPromised from 'chai-as-promised';
import * as fs from 'fs';
import * as tmp from 'tmp';
import * as component from '../../common/component';
import { cleanupUnitTest, prepareUnitTest } from '../../common/utils';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { PAITrainingService } from '../pai/paiTrainingService';
// TODO: copy mockedTrial.py to local folder
const localCodeDir: string = tmp.dirSync().name
const mockedTrialPath: string = './training_service/test/mockedTrial.py'
fs.copyFileSync(mockedTrialPath, localCodeDir + '/mockedTrial.py')
describe('Unit Test for PAITrainingService', () => {
let skip: boolean = false;
let testPaiClusterInfo: any;
let paiCluster: any;
let paiTrialConfig : any;
try {
testPaiClusterInfo = JSON.parse(fs.readFileSync('../../.vscode/paiCluster.json', 'utf8'));
paiCluster = `{\"userName\":\"${testPaiClusterInfo.userName}\",\"passWord\":\"${testPaiClusterInfo.passWord}\",\"host\":\"${testPaiClusterInfo.host}\"}`;
paiTrialConfig = `{\"command\":\"echo hello && ls\",\"codeDir\":\"/home/desy/nni/examples/trials/mnist",\"gpuNum\":\"1\",
\"cpuNum\":\"1\",\"memoryMB\":\"8196\",\"image\":\"openpai/pai.example.tensorflow\",\"dataDir\":\"\",\"outputDir\":\"\"}`;
} catch (err) {
console.log('Please configure paiCluster.json to enable the PAI training service unit test.');
skip = true;
}
let paiTrainingService: PAITrainingService;
console.log(tmp.dirSync().name);
before(() => {
chai.should();
chai.use(chaiAsPromised);
prepareUnitTest();
});
after(() => {
cleanupUnitTest();
});
beforeEach(() => {
if (skip) {
return;
}
paiTrainingService = component.get(PAITrainingService);
paiTrainingService.run();
});
afterEach(() => {
if (skip) {
return;
}
paiTrainingService.cleanUp();
});
it('Get PAI token', async () => {
if (skip) {
return;
}
console.log(`paiCluster is ${paiCluster}`)
await paiTrainingService.setClusterMetadata(TrialConfigMetadataKey.PAI_CLUSTER_CONFIG, paiCluster);
await paiTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, paiTrialConfig);
try {
const trialDetail = await paiTrainingService.submitTrialJob({jobType : 'TRIAL'});
chai.expect(trialDetail.status).to.be.equals('WAITING');
} catch(error) {
console.log('Submit job failed:' + error);
chai.assert(error)
}
});
});
\ No newline at end of file