"examples/vscode:/vscode.git/clone" did not exist on "ede593804169a727e65b6723ec8a075afdf4c8c2"
Unverified Commit 1e511829 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Merge pull request #245 from microsoft/master

merge master
parents e29b58a1 6485ef17
......@@ -173,12 +173,12 @@ install-python-modules:
dev-install-python-modules:
#$(_INFO) Installing Python SDK $(_END)
mkdir -p build
ln -sf ../src/sdk/pynni/nni build
ln -sf ../src/sdk/pycli/nnicli build
ln -sf ../tools/nni_annotation build
ln -sf ../tools/nni_cmd build
ln -sf ../tools/nni_trial_tool build
ln -sf ../tools/nni_gpu_tool build
ln -sfT ../src/sdk/pynni/nni build/nni
ln -sfT ../src/sdk/pycli/nnicli build/nnicli
ln -sfT ../tools/nni_annotation build/nni_annotation
ln -sfT ../tools/nni_cmd build/nni_cmd
ln -sfT ../tools/nni_trial_tool build/nni_trial_tool
ln -sfT ../tools/nni_gpu_tool build/nni_gpu_tool
cp setup.py build/
cp README.md build/
sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' build/setup.py
......@@ -205,16 +205,14 @@ install-node-modules:
.PHONY: dev-install-node-modules
dev-install-node-modules:
#$(_INFO) Installing NNI Package $(_END)
rm -rf $(NNI_PKG_FOLDER)
ln -sf ${PWD}/src/nni_manager/dist $(NNI_PKG_FOLDER)
ln -sfT ${PWD}/src/nni_manager/dist $(NNI_PKG_FOLDER)
cp src/nni_manager/package.json $(NNI_PKG_FOLDER)
sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' $(NNI_PKG_FOLDER)/package.json
ln -sf ${PWD}/src/nni_manager/node_modules $(NNI_PKG_FOLDER)
ln -sf ${PWD}/src/webui/build -t $(NNI_PKG_FOLDER)
mv $(NNI_PKG_FOLDER)/build $(NNI_PKG_FOLDER)/static
ln -sfT ${PWD}/src/nni_manager/node_modules $(NNI_PKG_FOLDER)/node_modules
ln -sfT ${PWD}/src/webui/build $(NNI_PKG_FOLDER)/static
mkdir -p $(NASUI_PKG_FOLDER)
ln -sf ${PWD}/src/nasui/build $(NASUI_PKG_FOLDER)
ln -sf ${PWD}/src/nasui/server.js $(NASUI_PKG_FOLDER)
ln -sfT ${PWD}/src/nasui/build $(NASUI_PKG_FOLDER)/build
ln -sfT ${PWD}/src/nasui/server.js $(NASUI_PKG_FOLDER)/server.js
.PHONY: install-scripts
install-scripts:
......
......@@ -342,7 +342,7 @@ With authors' permission, we listed a set of NNI usage examples and relevant art
Join IM discussion groups:
|Gitter||WeChat|
|----|----|----|
|![image](https://user-images.githubusercontent.com/39592018/80665738-e0574a80-8acc-11ea-91bc-0836dc4cbf89.png)| OR |![image](https://user-images.githubusercontent.com/39592018/80665762-f06f2a00-8acc-11ea-8d22-e461e68e2d9b.png)|
|![image](https://user-images.githubusercontent.com/39592018/80665738-e0574a80-8acc-11ea-91bc-0836dc4cbf89.png)| OR |![image](https://github.com/JSong-Jia/NNI-user-group/blob/master/NNI%20user%20group_3.png)|
## Related Projects
......
......@@ -92,8 +92,18 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod
* Required key. Set the mount path in your container used in PAI.
* paiStoragePlugin
* Optional key. Set the storage plugin name used in PAI. If it is not set in trial configuration, it should be set in the config file specified in `paiConfigPath` field.
* command
* Optional key. Set the commands used in PAI container.
* paiConfigPath
* Optional key. Set the file path of pai job configuration, the file is in yaml format.
If users set `paiConfigPath` in NNI's configuration file, no need to specify the fields `command`, `paiStoragePlugin`, `virtualCluster`, `image`, `memoryMB`, `cpuNum`, `gpuNum` in `trial` configuration. These fields will use the values from the config file specified by `paiConfigPath`.
```
Note:
1. The job name in PAI's configuration file will be replaced by a new job name, the new job name is created by NNI, the name format is nni_exp_${this.experimentId}_trial_${trialJobId}.
2. If users set multiple taskRoles in PAI's configuration file, NNI will wrap all of these taksRoles and start multiple tasks in one trial job, users should ensure that only one taskRole report metric to NNI, otherwise there might be some conflict error.
```
Once complete to fill NNI experiment config file and save (for example, save as exp_pai.yml), then run the following command
......
......@@ -23,6 +23,13 @@
"@typescript-eslint/consistent-type-assertions": 0,
"@typescript-eslint/no-inferrable-types": 0,
"no-inner-declarations": 0,
"@typescript-eslint/explicit-function-return-type": "error",
"@typescript-eslint/no-unused-vars": [
"error",
{
"argsIgnorePattern": "^_"
}
],
"@typescript-eslint/no-var-requires": 0
},
"ignorePatterns": [
......
......@@ -98,7 +98,7 @@ class SqlDB implements Database {
this.resolve(this.initTask, err);
} else {
if (createNew) {
this.db.exec(createTables, (error: Error | null) => {
this.db.exec(createTables, (_error: Error | null) => {
this.resolve(this.initTask, err);
});
} else {
......
......@@ -14,7 +14,6 @@
"azure-storage": "^2.10.2",
"chai-as-promised": "^7.1.1",
"child-process-promise": "^2.2.1",
"deepmerge": "^4.2.2",
"express": "^4.16.3",
"express-joi-validator": "^2.0.0",
"js-base64": "^2.4.9",
......
......@@ -60,7 +60,7 @@ class NNIRestHandler {
this.exportData(router);
// Express-joi-validator configuration
router.use((err: any, req: Request, res: Response, next: any) => {
router.use((err: any, _req: Request, res: Response, _next: any) => {
if (err.isBoom) {
this.log.error(err.output.payload);
......
......@@ -131,7 +131,7 @@ class DLTSTrainingService implements TrainingService {
private async statusCheckingLoop(): Promise<void> {
while (!this.stopping) {
const updateDLTSTrialJobs: Promise<void>[] = [];
for (const [trialJobId, dltsTrialJob] of this.trialJobsMap) {
for (const dltsTrialJob of this.trialJobsMap.values()) {
updateDLTSTrialJobs.push(this.getDLTSTrialJobInfo(dltsTrialJob));
}
......@@ -405,7 +405,7 @@ class DLTSTrainingService implements TrainingService {
}
}
public async getClusterMetadata(key: string): Promise<string> {
public async getClusterMetadata(_key: string): Promise<string> {
return '';
}
......@@ -545,7 +545,7 @@ class DLTSTrainingService implements TrainingService {
body: parameterFileMeta
};
await new Promise((resolve, reject) => {
request(req, (err: Error, res: request.Response) => {
request(req, (err: Error, _res: request.Response) => {
if (err) {
reject(err);
} else {
......
......@@ -20,7 +20,7 @@ export namespace AzureStorageClientUtility {
*/
export async function createShare(fileServerClient: any, azureShare: any): Promise<boolean> {
const deferred: Deferred<boolean> = new Deferred<boolean>();
fileServerClient.createShareIfNotExists(azureShare, (error: any, result: any, response: any) => {
fileServerClient.createShareIfNotExists(azureShare, (error: any, _result: any, _response: any) => {
if (error) {
getLogger()
.error(`Create share failed:, ${error}`);
......@@ -41,7 +41,7 @@ export namespace AzureStorageClientUtility {
*/
export async function createDirectory(fileServerClient: azureStorage.FileService, azureFoler: any, azureShare: any): Promise<boolean> {
const deferred: Deferred<boolean> = new Deferred<boolean>();
fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, (error: any, result: any, response: any) => {
fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, (error: any, _result: any, _response: any) => {
if (error) {
getLogger()
.error(`Create directory failed:, ${error}`);
......@@ -89,7 +89,7 @@ export namespace AzureStorageClientUtility {
localFilePath: string): Promise<boolean> {
const deferred: Deferred<boolean> = new Deferred<boolean>();
await fileServerClient.createFileFromLocalFile(azureShare, azureDirectory, azureFileName, localFilePath,
(error: any, result: any, response: any) => {
(error: any, _result: any, _response: any) => {
if (error) {
getLogger()
.error(`Upload file failed:, ${error}`);
......@@ -114,7 +114,7 @@ export namespace AzureStorageClientUtility {
localFilePath: string): Promise<boolean> {
const deferred: Deferred<boolean> = new Deferred<boolean>();
await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath),
(error: any, result: any, response: any) => {
(error: any, _result: any, _response: any) => {
if (error) {
getLogger()
.error(`Download file failed:, ${error}`);
......@@ -183,7 +183,7 @@ export namespace AzureStorageClientUtility {
const deferred: Deferred<void> = new Deferred<void>();
await mkDirP(localDirectory);
fileServerClient.listFilesAndDirectoriesSegmented(azureShare, azureDirectory, 'null',
async (error: any, result: any, response: any) => {
async (_error: any, result: any, _response: any) => {
if (('entries' in result) === false) {
getLogger()
.error(`list files failed, can't get entries in result`);
......
......@@ -40,8 +40,8 @@ export class KubernetesJobInfoCollector {
await Promise.all(updateKubernetesTrialJobs);
}
protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
kubernetesTrialJob: KubernetesTrialJobDetail): Promise<void> {
protected async retrieveSingleTrialJobInfo(_kubernetesCRDClient: KubernetesCRDClient | undefined,
_kubernetesTrialJob: KubernetesTrialJobDetail): Promise<void> {
throw new MethodNotImplementedError();
}
}
......@@ -77,7 +77,7 @@ abstract class KubernetesTrainingService {
public async listTrialJobs(): Promise<TrialJobDetail[]> {
const jobs: TrialJobDetail[] = [];
for (const [key, value] of this.trialJobsMap) {
for (const key of this.trialJobsMap.keys()) {
jobs.push(await this.getTrialJob(key));
}
......@@ -107,7 +107,7 @@ abstract class KubernetesTrainingService {
return false;
}
public getClusterMetadata(key: string): Promise<string> {
public getClusterMetadata(_key: string): Promise<string> {
return Promise.resolve('');
}
......
......@@ -3,7 +3,6 @@
'use strict';
import {TrialConfig} from '../common/trialConfig';
import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
export class PAIClusterConfig {
......
......@@ -62,7 +62,7 @@ export class PAIJobInfoCollector {
};
//TODO : pass in request timeout param?
request(getJobInfoRequest, (error: Error, response: request.Response, body: any) => {
request(getJobInfoRequest, (error: Error, response: request.Response, _body: any) => {
if ((error !== undefined && error !== null) || response.statusCode >= 500) {
this.log.error(`PAI Training service: get job info for trial ${paiTrialJob.id} from PAI Cluster failed!`);
// Queried PAI job info failed, set job status to UNKNOWN
......
......@@ -4,8 +4,6 @@
'use strict';
import { Request, Response, Router } from 'express';
import { Inject } from 'typescript-ioc';
import * as component from '../../common/component';
import { ClusterJobRestServer } from '../common/clusterJobRestServer';
import { PAITrainingService } from './paiTrainingService';
......
......@@ -19,7 +19,6 @@
'use strict';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import * as path from 'path';
// tslint:disable-next-line:no-implicit-dependencies
......@@ -29,11 +28,13 @@ import * as component from '../../../common/component';
import { Deferred } from 'ts-deferred';
import { String } from 'typescript-string-operations';
import {
HyperParameters, NNIManagerIpConfig, TrainingService,
TrialJobApplicationForm, TrialJobDetail, TrialJobMetric
HyperParameters, NNIManagerIpConfig,
TrialJobApplicationForm, TrialJobDetail
} from '../../../common/trainingService';
import { delay, generateParamFileName,
getExperimentRootDir, getIPV4Address, getVersion, uniqueString, unixPathJoin } from '../../../common/utils';
import {
generateParamFileName,
getIPV4Address, getVersion, uniqueString
} from '../../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import { execMkdir, validateCodeDir, execCopydir } from '../../common/util';
......@@ -44,7 +45,6 @@ import { PAIClusterConfig, PAITrialJobDetail } from '../paiConfig';
import { PAIJobRestServer } from '../paiJobRestServer';
const yaml = require('js-yaml');
const deepmerge = require('deepmerge');
/**
* Training Service implementation for OpenPAI (Open Platform for AI)
......@@ -53,9 +53,11 @@ const deepmerge = require('deepmerge');
@component.Singleton
class PAIK8STrainingService extends PAITrainingService {
protected paiTrialConfig: NNIPAIK8STrialConfig | undefined;
private paiJobConfig: undefined;
private nniVersion: string | undefined;
constructor() {
super();
}
public async setClusterMetadata(key: string, value: string): Promise<void> {
......@@ -68,10 +70,10 @@ class PAIK8STrainingService extends PAITrainingService {
this.paiJobRestServer = new PAIJobRestServer(component.get(PAIK8STrainingService));
this.paiClusterConfig = <PAIClusterConfig>JSON.parse(value);
this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host);
if(this.paiClusterConfig.passWord) {
if (this.paiClusterConfig.passWord) {
// Get PAI authentication token
await this.updatePaiToken();
} else if(this.paiClusterConfig.token) {
} else if (this.paiClusterConfig.token) {
this.paiToken = this.paiClusterConfig.token;
}
break;
......@@ -84,9 +86,13 @@ class PAIK8STrainingService extends PAITrainingService {
this.paiTrialConfig = <NNIPAIK8STrialConfig>JSON.parse(value);
// Validate to make sure codeDir doesn't have too many files
await validateCodeDir(this.paiTrialConfig.codeDir);
if (this.paiTrialConfig.paiConfigPath) {
this.paiJobConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8'));
}
break;
case TrialConfigMetadataKey.VERSION_CHECK:
this.versionCheck = (value === 'true' || value === 'True');
this.nniVersion = this.versionCheck ? await getVersion() : '';
break;
case TrialConfigMetadataKey.LOG_COLLECTION:
this.logCollection = value;
......@@ -99,7 +105,7 @@ class PAIK8STrainingService extends PAITrainingService {
this.log.error(`Uknown key: ${key}`);
}
}
// update trial parameters for multi-phase
public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise<TrialJobDetail> {
const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
......@@ -142,71 +148,99 @@ class PAIK8STrainingService extends PAITrainingService {
return trialJobDetail;
}
public generateJobConfigInYamlFormat(trialJobId: string, command: string) {
private generateNNITrialCommand(trialJobDetail: PAITrialJobDetail, command: string): string {
if (this.paiTrialConfig === undefined) {
throw new Error('trial config is not initialized');
}
const jobName = `nni_exp_${this.experimentId}_trial_${trialJobId}`
const paiJobConfig: any = {
protocolVersion: 2,
name: jobName,
type: 'job',
jobRetryCount: 0,
prerequisites: [
{
type: 'dockerimage',
uri: this.paiTrialConfig.image,
name: 'docker_image_0'
}
],
taskRoles: {
taskrole: {
instances: 1,
completion: {
minFailedInstances: 1,
minSucceededInstances: -1
},
taskRetryCount: 0,
dockerImage: 'docker_image_0',
resourcePerInstance: {
gpu: this.paiTrialConfig.gpuNum,
cpu: this.paiTrialConfig.cpuNum,
memoryMB: this.paiTrialConfig.memoryMB
},
commands: [
command
]
}
},
extras: {
'com.microsoft.pai.runtimeplugin': [
{
plugin: this.paiTrialConfig.paiStoragePlugin
}
],
submitFrom: 'submit-job-v2'
}
}
if (this.paiTrialConfig.virtualCluster) {
paiJobConfig.defaults= {
virtualCluster: this.paiTrialConfig.virtualCluster
}
const containerWorkingDir: string = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}/${trialJobDetail.id}`;
const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address();
const nniPaiTrialCommand: string = String.Format(
PAI_K8S_TRIAL_COMMAND_FORMAT,
`${containerWorkingDir}`,
`${containerWorkingDir}/nnioutput`,
trialJobDetail.id,
this.experimentId,
trialJobDetail.form.sequenceId,
this.isMultiPhase,
command,
nniManagerIp,
this.paiRestServerPort,
this.nniVersion,
this.logCollection
)
.replace(/\r\n|\n|\r/gm, '');
return nniPaiTrialCommand;
}
private generateJobConfigInYamlFormat(trialJobDetail: PAITrialJobDetail): any {
if (this.paiTrialConfig === undefined) {
throw new Error('trial config is not initialized');
}
const jobName = `nni_exp_${this.experimentId}_trial_${trialJobDetail.id}`
let nniJobConfig: any = undefined;
if (this.paiTrialConfig.paiConfigPath) {
try {
const additionalPAIConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8'));
//deepmerge(x, y), if an element at the same key is present for both x and y, the value from y will appear in the result.
//refer: https://github.com/TehShrike/deepmerge
const overwriteMerge = (destinationArray: any, sourceArray: any, options: any) => sourceArray;
return yaml.safeDump(deepmerge(additionalPAIConfig, paiJobConfig, { arrayMerge: overwriteMerge }));
} catch (error) {
this.log.error(`Error occurs during loading and merge ${this.paiTrialConfig.paiConfigPath} : ${error}`);
nniJobConfig = this.paiJobConfig;
nniJobConfig.name = jobName;
// Each taskRole will generate new command in NNI's command format
// Each command will be formatted to NNI style
for (const taskRoleIndex in nniJobConfig.taskRoles) {
const commands = nniJobConfig.taskRoles[taskRoleIndex].commands
const nniTrialCommand = this.generateNNITrialCommand(trialJobDetail, commands.join(" && ").replace(/(["'$`\\])/g, '\\$1'));
nniJobConfig.taskRoles[taskRoleIndex].commands = [nniTrialCommand]
}
} else {
return yaml.safeDump(paiJobConfig);
nniJobConfig = {
protocolVersion: 2,
name: jobName,
type: 'job',
jobRetryCount: 0,
prerequisites: [
{
type: 'dockerimage',
uri: this.paiTrialConfig.image,
name: 'docker_image_0'
}
],
taskRoles: {
taskrole: {
instances: 1,
completion: {
minFailedInstances: 1,
minSucceededInstances: -1
},
taskRetryCount: 0,
dockerImage: 'docker_image_0',
resourcePerInstance: {
gpu: this.paiTrialConfig.gpuNum,
cpu: this.paiTrialConfig.cpuNum,
memoryMB: this.paiTrialConfig.memoryMB
},
commands: [
this.generateNNITrialCommand(trialJobDetail, this.paiTrialConfig.command)
]
}
},
extras: {
'com.microsoft.pai.runtimeplugin': [
{
plugin: this.paiTrialConfig.paiStoragePlugin
}
],
submitFrom: 'submit-job-v2'
}
}
if (this.paiTrialConfig.virtualCluster) {
nniJobConfig.defaults = {
virtualCluster: this.paiTrialConfig.virtualCluster
}
}
}
}
return yaml.safeDump(nniJobConfig);
}
protected async submitTrialJobToPAI(trialJobId: string): Promise<boolean> {
const deferred: Deferred<boolean> = new Deferred<boolean>();
......@@ -247,29 +281,8 @@ class PAIK8STrainingService extends PAITrainingService {
//Copy codeDir files to local working folder
await execCopydir(this.paiTrialConfig.codeDir, trialJobDetail.logPath);
const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address();
const version: string = this.versionCheck ? await getVersion() : '';
const containerWorkingDir: string = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}/${trialJobId}`;
const nniPaiTrialCommand: string = String.Format(
PAI_K8S_TRIAL_COMMAND_FORMAT,
`${containerWorkingDir}`,
`${containerWorkingDir}/nnioutput`,
trialJobId,
this.experimentId,
trialJobDetail.form.sequenceId,
this.isMultiPhase,
this.paiTrialConfig.command,
nniManagerIp,
this.paiRestServerPort,
version,
this.logCollection
)
.replace(/\r\n|\n|\r/gm, '');
this.log.info(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`);
const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobId, nniPaiTrialCommand);
//Generate Job Configuration in yaml format
const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobDetail);
this.log.debug(paiJobConfig);
// Step 3. Submit PAI job via Rest call
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
......
......@@ -3,27 +3,21 @@
'use strict';
import * as fs from 'fs';
import * as path from 'path';
import * as request from 'request';
import * as component from '../../common/component';
import { EventEmitter } from 'events';
import { Deferred } from 'ts-deferred';
import { String } from 'typescript-string-operations';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import {
HyperParameters, NNIManagerIpConfig, TrainingService,
NNIManagerIpConfig, TrainingService,
TrialJobApplicationForm, TrialJobDetail, TrialJobMetric
} from '../../common/trainingService';
import { delay, generateParamFileName,
getExperimentRootDir, getIPV4Address, getVersion, uniqueString, unixPathJoin } from '../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { execMkdir, validateCodeDir } from '../common/util';
import { delay } from '../../common/utils';
import { PAIJobInfoCollector } from './paiJobInfoCollector';
import { PAIJobRestServer, ParameterFileMeta } from './paiJobRestServer';
import { PAIJobRestServer } from './paiJobRestServer';
import { PAIClusterConfig, PAITrialJobDetail } from './paiConfig';
/**
......@@ -39,7 +33,7 @@ abstract class PAITrainingService implements TrainingService {
protected paiClusterConfig?: PAIClusterConfig;
protected readonly jobQueue: string[];
protected stopping: boolean = false;
protected paiToken? : string;
protected paiToken?: string;
protected paiTokenUpdateTime?: number;
protected readonly paiTokenUpdateInterval: number;
protected readonly experimentId!: string;
......@@ -81,15 +75,15 @@ abstract class PAITrainingService implements TrainingService {
this.log.info('PAI training service exit.');
}
public async submitTrialJob(form: TrialJobApplicationForm): Promise<any> {
public async submitTrialJob(_form: TrialJobApplicationForm): Promise<any> {
throw new Error('Not implemented!');
}
public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise<TrialJobDetail> {
public async updateTrialJob(_trialJobId: string, _form: TrialJobApplicationForm): Promise<TrialJobDetail> {
throw new Error('Not implemented!');
}
protected async submitTrialJobToPAI(trialJobId: string): Promise<boolean> {
protected async submitTrialJobToPAI(_trialJobId: string): Promise<boolean> {
throw new Error('Not implemented!');
}
......@@ -109,14 +103,14 @@ abstract class PAITrainingService implements TrainingService {
}
}
public async setClusterMetadata(key: string, value: string): Promise<void> {
public async setClusterMetadata(_key: string, _value: string): Promise<void> {
throw new Error('Not implemented!');
}
public async listTrialJobs(): Promise<TrialJobDetail[]> {
const jobs: TrialJobDetail[] = [];
for (const [key, value] of this.trialJobsMap) {
for (const key of this.trialJobsMap.keys()) {
jobs.push(await this.getTrialJob(key));
}
......@@ -150,7 +144,7 @@ abstract class PAITrainingService implements TrainingService {
}
public cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise<void> {
const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
const trialJobDetail: PAITrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
if (trialJobDetail === undefined) {
return Promise.reject(new Error(`cancelTrialJob: trial job id ${trialJobId} not found`));
}
......@@ -169,10 +163,10 @@ abstract class PAITrainingService implements TrainingService {
const stopJobRequest: request.Options = {
uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v1/user/${this.paiClusterConfig.userName}\
/jobs/${trialJobDetail.paiJobName}/executionType`,
/jobs/${trialJobDetail.paiJobName}/executionType`,
method: 'PUT',
json: true,
body: {value: 'STOP'},
body: { value: 'STOP' },
headers: {
'Content-Type': 'application/json',
Authorization: `Bearer ${this.paiToken}`
......@@ -183,11 +177,11 @@ abstract class PAITrainingService implements TrainingService {
trialJobDetail.isEarlyStopped = isEarlyStopped;
const deferred: Deferred<void> = new Deferred<void>();
request(stopJobRequest, (error: Error, response: request.Response, body: any) => {
request(stopJobRequest, (error: Error, response: request.Response, _body: any) => {
if ((error !== undefined && error !== null) || response.statusCode >= 400) {
this.log.error(`PAI Training service: stop trial ${trialJobId} to PAI Cluster failed!`);
deferred.reject((error !== undefined && error !== null) ? error.message :
`Stop trial failed, http code: ${response.statusCode}`);
`Stop trial failed, http code: ${response.statusCode}`);
} else {
deferred.resolve();
}
......@@ -196,7 +190,7 @@ abstract class PAITrainingService implements TrainingService {
return deferred.promise;
}
public getClusterMetadata(key: string): Promise<string> {
public getClusterMetadata(_key: string): Promise<string> {
throw new Error('Not implemented!');
}
......@@ -236,7 +230,7 @@ abstract class PAITrainingService implements TrainingService {
protected async statusCheckingLoop(): Promise<void> {
while (!this.stopping) {
if(this.paiClusterConfig && this.paiClusterConfig.passWord) {
if (this.paiClusterConfig && this.paiClusterConfig.passWord) {
try {
await this.updatePaiToken();
} catch (error) {
......@@ -302,7 +296,7 @@ abstract class PAITrainingService implements TrainingService {
});
let timeoutId: NodeJS.Timer;
const timeoutDelay: Promise<void> = new Promise<void>((resolve: Function, reject: Function): void => {
const timeoutDelay: Promise<void> = new Promise<void>((_resolve: Function, reject: Function): void => {
// Set timeout and reject the promise once reach timeout (5 seconds)
timeoutId = setTimeout(
() => reject(new Error('Get PAI token timeout. Please check your PAI cluster.')),
......
......@@ -3,8 +3,6 @@
'use strict';
import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../../common/trainingService';
export const PAI_INSTALL_NNI_SHELL_FORMAT: string =
`#!/bin/bash
if python3 -c 'import nni' > /dev/null 2>&1; then
......
......@@ -8,24 +8,22 @@ import * as path from 'path';
import * as request from 'request';
import * as component from '../../../common/component';
import { EventEmitter } from 'events';
import { Deferred } from 'ts-deferred';
import { String } from 'typescript-string-operations';
import { getExperimentId } from '../../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../../common/log';
import {
HyperParameters, NNIManagerIpConfig, TrainingService,
TrialJobApplicationForm, TrialJobDetail, TrialJobMetric
HyperParameters, NNIManagerIpConfig,
TrialJobApplicationForm, TrialJobDetail
} from '../../../common/trainingService';
import { delay, generateParamFileName,
getExperimentRootDir, getIPV4Address, getVersion, uniqueString, unixPathJoin } from '../../../common/utils';
import {
generateParamFileName,
getExperimentRootDir, getIPV4Address, getVersion, uniqueString, unixPathJoin
} from '../../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import { execMkdir, validateCodeDir } from '../../common/util';
import { HDFSClientUtility } from './hdfsClientUtility';
import { NNIPAITrialConfig, PAIJobConfig, PAITaskRole } from './paiYarnConfig';
import { PAI_LOG_PATH_FORMAT, PAI_TRIAL_COMMAND_FORMAT } from './paiYarnData';
import { PAIJobInfoCollector } from '../paiJobInfoCollector';
import { PAITrainingService } from '../paiTrainingService';
import { PAIClusterConfig, PAITrialJobDetail } from '../paiConfig';
......@@ -65,7 +63,7 @@ class PAIYarnTrainingService extends PAITrainingService {
PAI_LOG_PATH_FORMAT,
this.paiClusterConfig.host,
hdfsOutputDir
);
);
const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail(
trialJobId,
......@@ -99,13 +97,13 @@ class PAIYarnTrainingService extends PAITrainingService {
port: 80,
path: '/webhdfs/api/v1',
host: this.paiClusterConfig.host
});
this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host);
if(this.paiClusterConfig.passWord) {
if (this.paiClusterConfig.passWord) {
// Get PAI authentication token
await this.updatePaiToken();
} else if(this.paiClusterConfig.token) {
} else if (this.paiClusterConfig.token) {
this.paiToken = this.paiClusterConfig.token;
} else {
throw new Error('pai cluster config format error, please set password or token!');
......@@ -121,14 +119,14 @@ class PAIYarnTrainingService extends PAITrainingService {
// Validate to make sure codeDir doesn't have too many files
await validateCodeDir(this.paiTrialConfig.codeDir);
// Copy experiment files from local folder to HDFS
this.copyExpCodeDirPromise = HDFSClientUtility.copyDirectoryToHdfs(
this.paiTrialConfig.codeDir,
HDFSClientUtility.getHdfsExpCodeDir(this.paiClusterConfig.userName),
this.hdfsClient
);
// Upload authFile to hdfs
if (this.paiTrialConfig.authFile) {
this.authFileHdfsPath = unixPathJoin(HDFSClientUtility.hdfsExpRootDir(this.paiClusterConfig.userName), 'authFile');
......@@ -224,7 +222,7 @@ class PAIYarnTrainingService extends PAITrainingService {
version,
this.logCollection
)
.replace(/\r\n|\n|\r/gm, '');
.replace(/\r\n|\n|\r/gm, '');
this.log.info(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`);
const paiTaskRoles: PAITaskRole[] = [
......@@ -283,10 +281,8 @@ class PAIYarnTrainingService extends PAITrainingService {
Authorization: `Bearer ${this.paiToken}`
}
};
request(submitJobRequest, (error: Error, response: request.Response, body: any) => {
request(submitJobRequest, (error: Error, response: request.Response, _body: any) => {
if ((error !== undefined && error !== null) || response.statusCode >= 400) {
const errorMessage: string = (error !== undefined && error !== null) ? error.message :
`Submit trial ${trialJobId} failed, http code:${response.statusCode}, http body: ${response.body.message}`;
trialJobDetail.status = 'FAILED';
deferred.resolve(true);
} else {
......@@ -343,7 +339,7 @@ class PAIYarnTrainingService extends PAITrainingService {
json: true,
body: parameterFileMeta
};
request(req, (err: Error, res: request.Response) => {
request(req, (err: Error, _res: request.Response) => {
if (err) {
deferred.reject(err);
} else {
......
......@@ -351,7 +351,7 @@ class RemoteMachineTrainingService implements TrainingService {
* Get culster metadata
* @param key metadata key
*/
public async getClusterMetadata(key: string): Promise<string> {
public async getClusterMetadata(_key: string): Promise<string> {
return "";
}
......
......@@ -1332,11 +1332,6 @@ deepmerge@^2.1.1:
version "2.2.1"
resolved "https://registry.yarnpkg.com/deepmerge/-/deepmerge-2.2.1.tgz#5d3ff22a01c00f645405a2fbc17d0778a1801170"
deepmerge@^4.2.2:
version "4.2.2"
resolved "https://registry.yarnpkg.com/deepmerge/-/deepmerge-4.2.2.tgz#44d2ea3679b8f4d4ffba33f03d865fc1e7bf4955"
integrity sha512-FJ3UgI4gIl+PHZm53knsuSFpE+nESMr7M4v9QcgB7S63Kj/6WqMiFQJpBBYz1Pt+66bZpP3Q7Lye0Oo9MPKEdg==
default-require-extensions@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/default-require-extensions/-/default-require-extensions-3.0.0.tgz#e03f93aac9b2b6443fc52e5e4a37b3ad9ad8df96"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment