Unverified Commit 543239c6 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Merge pull request #220 from microsoft/master

merge master
parents 32efaa36 659480f2
...@@ -10,7 +10,6 @@ import * as os from 'os'; ...@@ -10,7 +10,6 @@ import * as os from 'os';
import * as path from 'path'; import * as path from 'path';
import { String } from 'typescript-string-operations'; import { String } from 'typescript-string-operations';
import { countFilesRecursively, getNewLine, validateFileNameRecursively } from '../../common/utils'; import { countFilesRecursively, getNewLine, validateFileNameRecursively } from '../../common/utils';
import { file } from '../../node_modules/@types/tmp';
import { GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData'; import { GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData';
/** /**
...@@ -19,8 +18,7 @@ import { GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData'; ...@@ -19,8 +18,7 @@ import { GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData';
* @param codeDir codeDir in nni config file * @param codeDir codeDir in nni config file
* @returns file number under codeDir * @returns file number under codeDir
*/ */
// tslint:disable: no-redundant-jsdoc export async function validateCodeDir(codeDir: string): Promise<number> {
export async function validateCodeDir(codeDir: string) : Promise<number> {
let fileCount: number | undefined; let fileCount: number | undefined;
let fileNameValid: boolean = true; let fileNameValid: boolean = true;
try { try {
......
...@@ -11,7 +11,6 @@ import { String } from 'typescript-string-operations'; ...@@ -11,7 +11,6 @@ import { String } from 'typescript-string-operations';
import { getLogger } from '../../common/log'; import { getLogger } from '../../common/log';
import { mkDirP } from '../../common/utils'; import { mkDirP } from '../../common/utils';
// tslint:disable: no-redundant-jsdoc no-any no-unsafe-any
export namespace AzureStorageClientUtility { export namespace AzureStorageClientUtility {
/** /**
...@@ -66,7 +65,7 @@ export namespace AzureStorageClientUtility { ...@@ -66,7 +65,7 @@ export namespace AzureStorageClientUtility {
let rootDirectory: string = ''; let rootDirectory: string = '';
for (const directory of directories) { for (const directory of directories) {
rootDirectory += directory; rootDirectory += directory;
let result:boolean = await createDirectory(fileServerClient, rootDirectory, azureShare); const result: boolean = await createDirectory(fileServerClient, rootDirectory, azureShare);
if (!result) { if (!result) {
deferred.resolve(false); deferred.resolve(false);
return deferred.promise; return deferred.promise;
...@@ -114,7 +113,6 @@ export namespace AzureStorageClientUtility { ...@@ -114,7 +113,6 @@ export namespace AzureStorageClientUtility {
async function downloadFile(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any, async function downloadFile(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any,
localFilePath: string): Promise<boolean> { localFilePath: string): Promise<boolean> {
const deferred: Deferred<boolean> = new Deferred<boolean>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
// tslint:disable-next-line:non-literal-fs-path
await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath), await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath),
(error: any, result: any, response: any) => { (error: any, result: any, response: any) => {
if (error) { if (error) {
...@@ -136,12 +134,11 @@ export namespace AzureStorageClientUtility { ...@@ -136,12 +134,11 @@ export namespace AzureStorageClientUtility {
* @param azureShare : the azure share used * @param azureShare : the azure share used
* @param localDirectory : local directory to be uploaded * @param localDirectory : local directory to be uploaded
*/ */
// tslint:disable:non-literal-fs-path
export async function uploadDirectory(fileServerClient: azureStorage.FileService, azureDirectory: string, azureShare: any, export async function uploadDirectory(fileServerClient: azureStorage.FileService, azureDirectory: string, azureShare: any,
localDirectory: string): Promise<boolean> { localDirectory: string): Promise<boolean> {
const deferred: Deferred<boolean> = new Deferred<boolean>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
const fileNameArray: string[] = fs.readdirSync(localDirectory); const fileNameArray: string[] = fs.readdirSync(localDirectory);
let result: boolean = await createDirectoryRecursive(fileServerClient, azureDirectory, azureShare); const result: boolean = await createDirectoryRecursive(fileServerClient, azureDirectory, azureShare);
if (!result) { if (!result) {
deferred.resolve(false); deferred.resolve(false);
return deferred.promise; return deferred.promise;
...@@ -221,4 +218,3 @@ export namespace AzureStorageClientUtility { ...@@ -221,4 +218,3 @@ export namespace AzureStorageClientUtility {
return deferred.promise; return deferred.promise;
} }
} }
// tslint:enable: no-redundant-jsdoc no-any no-unsafe-any
...@@ -6,19 +6,6 @@ ...@@ -6,19 +6,6 @@
import * as fs from 'fs'; import * as fs from 'fs';
import { GeneralK8sClient, KubernetesCRDClient } from '../kubernetesApiClient'; import { GeneralK8sClient, KubernetesCRDClient } from '../kubernetesApiClient';
/**
* FrameworkController Client
*/
class FrameworkControllerClientFactory {
/**
* Factory method to generate operator client
*/
// tslint:disable-next-line:function-name
public static createClient(): KubernetesCRDClient {
return new FrameworkControllerClientV1();
}
}
/** /**
* FrameworkController ClientV1 * FrameworkController ClientV1
*/ */
...@@ -26,7 +13,6 @@ class FrameworkControllerClientV1 extends KubernetesCRDClient { ...@@ -26,7 +13,6 @@ class FrameworkControllerClientV1 extends KubernetesCRDClient {
/** /**
* constructor, to initialize frameworkcontroller CRD definition * constructor, to initialize frameworkcontroller CRD definition
*/ */
// tslint:disable: no-unsafe-any no-any
public constructor() { public constructor() {
super(); super();
this.crdSchema = JSON.parse(fs.readFileSync('./config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json', 'utf8')); this.crdSchema = JSON.parse(fs.readFileSync('./config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json', 'utf8'));
...@@ -36,11 +22,22 @@ class FrameworkControllerClientV1 extends KubernetesCRDClient { ...@@ -36,11 +22,22 @@ class FrameworkControllerClientV1 extends KubernetesCRDClient {
protected get operator(): any { protected get operator(): any {
return this.client.apis['frameworkcontroller.microsoft.com'].v1.namespaces('default').frameworks; return this.client.apis['frameworkcontroller.microsoft.com'].v1.namespaces('default').frameworks;
} }
// tslint:enable: no-unsafe-any no-any
public get containerName(): string { public get containerName(): string {
return 'framework'; return 'framework';
} }
} }
/**
* FrameworkController Client
*/
class FrameworkControllerClientFactory {
/**
* Factory method to generate operator client
*/
public static createClient(): KubernetesCRDClient {
return new FrameworkControllerClientV1();
}
}
export { FrameworkControllerClientFactory, GeneralK8sClient }; export { FrameworkControllerClientFactory, GeneralK8sClient };
...@@ -9,7 +9,6 @@ import { AzureStorage, KeyVaultConfig, KubernetesClusterConfig, KubernetesCluste ...@@ -9,7 +9,6 @@ import { AzureStorage, KeyVaultConfig, KubernetesClusterConfig, KubernetesCluste
KubernetesStorageKind, KubernetesTrialConfig, KubernetesTrialConfigTemplate, NFSConfig, StorageConfig KubernetesStorageKind, KubernetesTrialConfig, KubernetesTrialConfigTemplate, NFSConfig, StorageConfig
} from '../kubernetesConfig'; } from '../kubernetesConfig';
// tslint:disable:completed-docs
export class FrameworkAttemptCompletionPolicy { export class FrameworkAttemptCompletionPolicy {
public readonly minFailedTaskCount: number; public readonly minFailedTaskCount: number;
public readonly minSucceededTaskCount: number; public readonly minSucceededTaskCount: number;
...@@ -26,7 +25,7 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi ...@@ -26,7 +25,7 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
public readonly frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy; public readonly frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy;
public readonly name: string; public readonly name: string;
public readonly taskNum: number; public readonly taskNum: number;
constructor(taskNum: number, command : string, gpuNum : number, constructor(taskNum: number, command: string, gpuNum: number,
cpuNum: number, memoryMB: number, image: string, cpuNum: number, memoryMB: number, image: string,
frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy, privateRegistryFilePath?: string | undefined) { frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy, privateRegistryFilePath?: string | undefined) {
super(command, gpuNum, cpuNum, memoryMB, image, privateRegistryFilePath); super(command, gpuNum, cpuNum, memoryMB, image, privateRegistryFilePath);
...@@ -54,7 +53,6 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig { ...@@ -54,7 +53,6 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig {
} }
} }
// tslint:disable:function-name
export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfigNFS { export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfigNFS {
public readonly serviceAccountName: string; public readonly serviceAccountName: string;
constructor( constructor(
......
...@@ -17,7 +17,7 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec ...@@ -17,7 +17,7 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
} }
protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined, protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> { kubernetesTrialJob: KubernetesTrialJobDetail): Promise<void> {
if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) { if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) {
return Promise.resolve(); return Promise.resolve();
} }
...@@ -26,7 +26,6 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec ...@@ -26,7 +26,6 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
return Promise.reject('kubernetesCRDClient is undefined'); return Promise.reject('kubernetesCRDClient is undefined');
} }
// tslint:disable-next-line:no-any
let kubernetesJobInfo: any; let kubernetesJobInfo: any;
try { try {
kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName); kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName);
...@@ -37,9 +36,9 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec ...@@ -37,9 +36,9 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
return Promise.resolve(); return Promise.resolve();
} }
// tslint:disable: no-unsafe-any
if (kubernetesJobInfo.status && kubernetesJobInfo.status.state) { if (kubernetesJobInfo.status && kubernetesJobInfo.status.state) {
const frameworkJobType: FrameworkControllerJobStatus = <FrameworkControllerJobStatus>kubernetesJobInfo.status.state; const frameworkJobType: FrameworkControllerJobStatus = <FrameworkControllerJobStatus>kubernetesJobInfo.status.state;
/* eslint-disable require-atomic-updates */
switch (frameworkJobType) { switch (frameworkJobType) {
case 'AttemptCreationPending': case 'AttemptCreationPending':
case 'AttemptCreationRequested': case 'AttemptCreationRequested':
...@@ -52,8 +51,8 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec ...@@ -52,8 +51,8 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
kubernetesTrialJob.startTime = Date.parse(<string>kubernetesJobInfo.status.startTime); kubernetesTrialJob.startTime = Date.parse(<string>kubernetesJobInfo.status.startTime);
} }
break; break;
case 'Completed': case 'Completed': {
const completedJobType : FrameworkControllerJobCompleteStatus = const completedJobType: FrameworkControllerJobCompleteStatus =
<FrameworkControllerJobCompleteStatus>kubernetesJobInfo.status.attemptStatus.completionStatus.type.name; <FrameworkControllerJobCompleteStatus>kubernetesJobInfo.status.attemptStatus.completionStatus.type.name;
switch (completedJobType) { switch (completedJobType) {
case 'Succeeded': case 'Succeeded':
...@@ -66,11 +65,12 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec ...@@ -66,11 +65,12 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
} }
kubernetesTrialJob.endTime = Date.parse(<string>kubernetesJobInfo.status.completionTime); kubernetesTrialJob.endTime = Date.parse(<string>kubernetesJobInfo.status.completionTime);
break; break;
}
default: default:
} }
/* eslint-enable require-atomic-updates */
} }
return Promise.resolve(); return Promise.resolve();
} }
// tslint:enable: no-unsafe-any
} }
...@@ -15,7 +15,6 @@ import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from ...@@ -15,7 +15,6 @@ import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import { validateCodeDir } from '../../common/util'; import { validateCodeDir } from '../../common/util';
import { AzureStorageClientUtility } from '../azureStorageClientUtils';
import { NFSConfig } from '../kubernetesConfig'; import { NFSConfig } from '../kubernetesConfig';
import { KubernetesTrialJobDetail } from '../kubernetesData'; import { KubernetesTrialJobDetail } from '../kubernetesData';
import { KubernetesTrainingService } from '../kubernetesTrainingService'; import { KubernetesTrainingService } from '../kubernetesTrainingService';
...@@ -102,7 +101,6 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -102,7 +101,6 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
this.trialJobsMap.set(trialJobId, trialJobDetail); this.trialJobsMap.set(trialJobId, trialJobDetail);
// Create frameworkcontroller job based on generated frameworkcontroller job resource config // Create frameworkcontroller job based on generated frameworkcontroller job resource config
// tslint:disable-next-line:no-any
const frameworkcontrollerJobConfig: any = await this.prepareFrameworkControllerConfig( const frameworkcontrollerJobConfig: any = await this.prepareFrameworkControllerConfig(
trialJobId, trialWorkingFolder, frameworkcontrollerJobName); trialJobId, trialWorkingFolder, frameworkcontrollerJobName);
await this.kubernetesCRDClient.createKubernetesJob(frameworkcontrollerJobConfig); await this.kubernetesCRDClient.createKubernetesJob(frameworkcontrollerJobConfig);
...@@ -113,13 +111,12 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -113,13 +111,12 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
return Promise.resolve(trialJobDetail); return Promise.resolve(trialJobDetail);
} }
// tslint:disable:no-redundant-jsdoc no-any no-unsafe-any
public async setClusterMetadata(key: string, value: string): Promise<void> { public async setClusterMetadata(key: string, value: string): Promise<void> {
switch (key) { switch (key) {
case TrialConfigMetadataKey.NNI_MANAGER_IP: case TrialConfigMetadataKey.NNI_MANAGER_IP:
this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value); this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
break; break;
case TrialConfigMetadataKey.FRAMEWORKCONTROLLER_CLUSTER_CONFIG: case TrialConfigMetadataKey.FRAMEWORKCONTROLLER_CLUSTER_CONFIG: {
const frameworkcontrollerClusterJsonObject: any = JSON.parse(value); const frameworkcontrollerClusterJsonObject: any = JSON.parse(value);
this.fcClusterConfig = FrameworkControllerClusterConfigFactory this.fcClusterConfig = FrameworkControllerClusterConfigFactory
.generateFrameworkControllerClusterConfig(frameworkcontrollerClusterJsonObject); .generateFrameworkControllerClusterConfig(frameworkcontrollerClusterJsonObject);
...@@ -130,9 +127,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -130,9 +127,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
this.azureStorageShare = azureFrameworkControllerClusterConfig.azureStorage.azureShare; this.azureStorageShare = azureFrameworkControllerClusterConfig.azureStorage.azureShare;
await this.createAzureStorage( await this.createAzureStorage(
azureFrameworkControllerClusterConfig.keyVault.vaultName, azureFrameworkControllerClusterConfig.keyVault.vaultName,
azureFrameworkControllerClusterConfig.keyVault.name, azureFrameworkControllerClusterConfig.keyVault.name
azureFrameworkControllerClusterConfig.azureStorage.accountName,
azureFrameworkControllerClusterConfig.azureStorage.azureShare
); );
} else if (this.fcClusterConfig.storageType === 'nfs') { } else if (this.fcClusterConfig.storageType === 'nfs') {
const nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS = const nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS =
...@@ -144,7 +139,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -144,7 +139,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
} }
this.kubernetesCRDClient = FrameworkControllerClientFactory.createClient(); this.kubernetesCRDClient = FrameworkControllerClientFactory.createClient();
break; break;
case TrialConfigMetadataKey.TRIAL_CONFIG: }
case TrialConfigMetadataKey.TRIAL_CONFIG: {
const frameworkcontrollerTrialJsonObjsect: any = JSON.parse(value); const frameworkcontrollerTrialJsonObjsect: any = JSON.parse(value);
this.fcTrialConfig = new FrameworkControllerTrialConfig( this.fcTrialConfig = new FrameworkControllerTrialConfig(
...@@ -161,6 +157,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -161,6 +157,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
return Promise.reject(new Error(error)); return Promise.reject(new Error(error));
} }
break; break;
}
case TrialConfigMetadataKey.VERSION_CHECK: case TrialConfigMetadataKey.VERSION_CHECK:
this.versionCheck = (value === 'true' || value === 'True'); this.versionCheck = (value === 'true' || value === 'True');
break; break;
...@@ -172,7 +169,6 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -172,7 +169,6 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
return Promise.resolve(); return Promise.resolve();
} }
// tslint:enable: no-any no-unsafe-any
/** /**
* upload code files to nfs or azureStroage * upload code files to nfs or azureStroage
...@@ -237,7 +233,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -237,7 +233,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`); await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);
const installScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT; const installScriptContent: string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files // Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), installScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), installScriptContent, { encoding: 'utf8' });
// Create tmp trial working folder locally. // Create tmp trial working folder locally.
...@@ -251,14 +247,12 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -251,14 +247,12 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
} }
// Write file content ( parameter.cfg ) to local tmp folders // Write file content ( parameter.cfg ) to local tmp folders
const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form);
if (form !== undefined) { if (form !== undefined) {
await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(form.hyperParameters)), await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(form.hyperParameters)),
form.hyperParameters.value, { encoding: 'utf8' }); form.hyperParameters.value, { encoding: 'utf8' });
} }
} }
// tslint:disable: no-any no-unsafe-any
private async prepareFrameworkControllerConfig(trialJobId: string, trialWorkingFolder: string, frameworkcontrollerJobName: string): private async prepareFrameworkControllerConfig(trialJobId: string, trialWorkingFolder: string, frameworkcontrollerJobName: string):
Promise<any> { Promise<any> {
...@@ -266,7 +260,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -266,7 +260,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
throw new Error('frameworkcontroller trial config is not initialized'); throw new Error('frameworkcontroller trial config is not initialized');
} }
const podResources : any = []; const podResources: any = [];
for (const taskRole of this.fcTrialConfig.taskRoles) { for (const taskRole of this.fcTrialConfig.taskRoles) {
const resource: any = {}; const resource: any = {};
resource.requests = this.generatePodResource(taskRole.memoryMB, taskRole.cpuNum, taskRole.gpuNum); resource.requests = this.generatePodResource(taskRole.memoryMB, taskRole.cpuNum, taskRole.gpuNum);
...@@ -300,7 +294,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -300,7 +294,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
* @param podResources pod template * @param podResources pod template
*/ */
private async generateFrameworkControllerJobConfig(trialJobId: string, trialWorkingFolder: string, private async generateFrameworkControllerJobConfig(trialJobId: string, trialWorkingFolder: string,
frameworkcontrollerJobName : string, podResources : any) : Promise<any> { frameworkcontrollerJobName: string, podResources: any): Promise<any> {
if (this.fcClusterConfig === undefined) { if (this.fcClusterConfig === undefined) {
throw new Error('frameworkcontroller Cluster config is not initialized'); throw new Error('frameworkcontroller Cluster config is not initialized');
} }
...@@ -424,7 +418,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -424,7 +418,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}] }]
}]; }];
let spec: any = { const spec: any = {
containers: containers, containers: containers,
initContainers: initContainers, initContainers: initContainers,
restartPolicy: 'OnFailure', restartPolicy: 'OnFailure',
...@@ -449,7 +443,6 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -449,7 +443,6 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
} }
}; };
} }
// tslint:enable: no-any no-unsafe-any
} }
export { FrameworkControllerTrainingService }; export { FrameworkControllerTrainingService };
...@@ -8,7 +8,6 @@ import { GeneralK8sClient, KubernetesCRDClient } from '../kubernetesApiClient'; ...@@ -8,7 +8,6 @@ import { GeneralK8sClient, KubernetesCRDClient } from '../kubernetesApiClient';
import { KubeflowOperator } from './kubeflowConfig'; import { KubeflowOperator } from './kubeflowConfig';
// tslint:disable: no-unsafe-any no-any completed-docs
class TFOperatorClientV1Alpha2 extends KubernetesCRDClient { class TFOperatorClientV1Alpha2 extends KubernetesCRDClient {
/** /**
* constructor, to initialize tfjob CRD definition * constructor, to initialize tfjob CRD definition
...@@ -130,7 +129,6 @@ class KubeflowOperatorClientFactory { ...@@ -130,7 +129,6 @@ class KubeflowOperatorClientFactory {
/** /**
* Factory method to generate operator client * Factory method to generate operator client
*/ */
// tslint:disable-next-line:function-name
public static createClient(kubeflowOperator: KubeflowOperator, operatorApiVersion: string): KubernetesCRDClient { public static createClient(kubeflowOperator: KubeflowOperator, operatorApiVersion: string): KubernetesCRDClient {
switch (kubeflowOperator) { switch (kubeflowOperator) {
case 'tf-operator': { case 'tf-operator': {
...@@ -169,5 +167,4 @@ class KubeflowOperatorClientFactory { ...@@ -169,5 +167,4 @@ class KubeflowOperatorClientFactory {
} }
} }
// tslint:enable: no-unsafe-any
export { KubeflowOperatorClientFactory, GeneralK8sClient }; export { KubeflowOperatorClientFactory, GeneralK8sClient };
...@@ -26,7 +26,6 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig { ...@@ -26,7 +26,6 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig {
} }
} }
// tslint:disable:completed-docs
export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS { export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
public readonly operator: KubeflowOperator; public readonly operator: KubeflowOperator;
constructor( constructor(
...@@ -43,7 +42,6 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS { ...@@ -43,7 +42,6 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
return 'nfs'; return 'nfs';
} }
// tslint:disable-next-line:function-name
public static getInstance(jsonObject: object): KubeflowClusterConfigNFS { public static getInstance(jsonObject: object): KubeflowClusterConfigNFS {
const kubeflowClusterConfigObjectNFS: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>jsonObject; const kubeflowClusterConfigObjectNFS: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>jsonObject;
assert (kubeflowClusterConfigObjectNFS !== undefined); assert (kubeflowClusterConfigObjectNFS !== undefined);
...@@ -75,7 +73,6 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure { ...@@ -75,7 +73,6 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure {
return 'azureStorage'; return 'azureStorage';
} }
// tslint:disable-next-line:function-name
public static getInstance(jsonObject: object): KubeflowClusterConfigAzure { public static getInstance(jsonObject: object): KubeflowClusterConfigAzure {
const kubeflowClusterConfigObjectAzure: KubeflowClusterConfigAzure = <KubeflowClusterConfigAzure>jsonObject; const kubeflowClusterConfigObjectAzure: KubeflowClusterConfigAzure = <KubeflowClusterConfigAzure>jsonObject;
...@@ -91,7 +88,6 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure { ...@@ -91,7 +88,6 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure {
export class KubeflowClusterConfigFactory { export class KubeflowClusterConfigFactory {
// tslint:disable-next-line:function-name
public static generateKubeflowClusterConfig(jsonObject: object): KubeflowClusterConfig { public static generateKubeflowClusterConfig(jsonObject: object): KubeflowClusterConfig {
const storageConfig: StorageConfig = <StorageConfig>jsonObject; const storageConfig: StorageConfig = <StorageConfig>jsonObject;
if (storageConfig === undefined) { if (storageConfig === undefined) {
...@@ -118,7 +114,7 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig { ...@@ -118,7 +114,7 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate { export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate {
public readonly replicas: number; public readonly replicas: number;
constructor(replicas: number, command : string, gpuNum : number, constructor(replicas: number, command: string, gpuNum: number,
cpuNum: number, memoryMB: number, image: string, privateRegistryAuthPath?: string) { cpuNum: number, memoryMB: number, image: string, privateRegistryAuthPath?: string) {
super(command, gpuNum, cpuNum, memoryMB, image, privateRegistryAuthPath); super(command, gpuNum, cpuNum, memoryMB, image, privateRegistryAuthPath);
this.replicas = replicas; this.replicas = replicas;
...@@ -156,8 +152,6 @@ export class KubeflowTrialConfigPytorch extends KubeflowTrialConfig { ...@@ -156,8 +152,6 @@ export class KubeflowTrialConfigPytorch extends KubeflowTrialConfig {
} }
export class KubeflowTrialConfigFactory { export class KubeflowTrialConfigFactory {
// tslint:disable-next-line:function-name
public static generateKubeflowTrialConfig(jsonObject: object, operator: KubeflowOperator): KubeflowTrialConfig { public static generateKubeflowTrialConfig(jsonObject: object, operator: KubeflowOperator): KubeflowTrialConfig {
if (operator === 'tf-operator') { if (operator === 'tf-operator') {
const kubeflowTrialConfigObject: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>jsonObject; const kubeflowTrialConfigObject: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>jsonObject;
......
...@@ -17,7 +17,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector { ...@@ -17,7 +17,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
} }
protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined, protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> { kubernetesTrialJob: KubernetesTrialJobDetail): Promise<void> {
if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) { if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) {
return Promise.resolve(); return Promise.resolve();
} }
...@@ -26,7 +26,6 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector { ...@@ -26,7 +26,6 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
return Promise.reject('kubernetesCRDClient is undefined'); return Promise.reject('kubernetesCRDClient is undefined');
} }
// tslint:disable:no-any no-unsafe-any
let kubernetesJobInfo: any; let kubernetesJobInfo: any;
try { try {
kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName); kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName);
...@@ -37,10 +36,10 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector { ...@@ -37,10 +36,10 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
//This is not treat as a error status //This is not treat as a error status
return Promise.resolve(); return Promise.resolve();
} }
/* eslint-disable require-atomic-updates */
if (kubernetesJobInfo.status && kubernetesJobInfo.status.conditions) { if (kubernetesJobInfo.status && kubernetesJobInfo.status.conditions) {
const latestCondition: any = kubernetesJobInfo.status.conditions[kubernetesJobInfo.status.conditions.length - 1]; const latestCondition: any = kubernetesJobInfo.status.conditions[kubernetesJobInfo.status.conditions.length - 1];
const tfJobType : KubeflowJobStatus = <KubeflowJobStatus>latestCondition.type; const tfJobType: KubeflowJobStatus = <KubeflowJobStatus>latestCondition.type;
switch (tfJobType) { switch (tfJobType) {
case 'Created': case 'Created':
kubernetesTrialJob.status = 'WAITING'; kubernetesTrialJob.status = 'WAITING';
...@@ -63,7 +62,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector { ...@@ -63,7 +62,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
default: default:
} }
} }
// tslint:enable:no-any no-unsafe-any /* eslint-enable require-atomic-updates */
return Promise.resolve(); return Promise.resolve();
} }
......
...@@ -17,7 +17,6 @@ import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from ...@@ -17,7 +17,6 @@ import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import { validateCodeDir } from '../../common/util'; import { validateCodeDir } from '../../common/util';
import { AzureStorageClientUtility } from '../azureStorageClientUtils';
import { NFSConfig } from '../kubernetesConfig'; import { NFSConfig } from '../kubernetesConfig';
import { KubernetesTrialJobDetail } from '../kubernetesData'; import { KubernetesTrialJobDetail } from '../kubernetesData';
import { KubernetesTrainingService } from '../kubernetesTrainingService'; import { KubernetesTrainingService } from '../kubernetesTrainingService';
...@@ -28,7 +27,6 @@ import { KubeflowClusterConfig, KubeflowClusterConfigAzure, KubeflowClusterConfi ...@@ -28,7 +27,6 @@ import { KubeflowClusterConfig, KubeflowClusterConfigAzure, KubeflowClusterConfi
import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector'; import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
import { KubeflowJobRestServer } from './kubeflowJobRestServer'; import { KubeflowJobRestServer } from './kubeflowJobRestServer';
// tslint:disable: no-unsafe-any no-any
/** /**
* Training Service implementation for Kubeflow * Training Service implementation for Kubeflow
* Refer https://github.com/kubeflow/kubeflow for more info about Kubeflow * Refer https://github.com/kubeflow/kubeflow for more info about Kubeflow
...@@ -109,14 +107,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -109,14 +107,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
return Promise.resolve(trialJobDetail); return Promise.resolve(trialJobDetail);
} }
// tslint:disable:no-redundant-jsdoc
public async setClusterMetadata(key: string, value: string): Promise<void> { public async setClusterMetadata(key: string, value: string): Promise<void> {
switch (key) { switch (key) {
case TrialConfigMetadataKey.NNI_MANAGER_IP: case TrialConfigMetadataKey.NNI_MANAGER_IP:
this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value); this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
break; break;
case TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG: case TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG: {
const kubeflowClusterJsonObject: object = JSON.parse(value); const kubeflowClusterJsonObject: object = JSON.parse(value);
this.kubeflowClusterConfig = KubeflowClusterConfigFactory.generateKubeflowClusterConfig(kubeflowClusterJsonObject); this.kubeflowClusterConfig = KubeflowClusterConfigFactory.generateKubeflowClusterConfig(kubeflowClusterJsonObject);
if (this.kubeflowClusterConfig.storageType === 'azureStorage') { if (this.kubeflowClusterConfig.storageType === 'azureStorage') {
...@@ -125,9 +122,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -125,9 +122,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
this.azureStorageShare = azureKubeflowClusterConfig.azureStorage.azureShare; this.azureStorageShare = azureKubeflowClusterConfig.azureStorage.azureShare;
await this.createAzureStorage( await this.createAzureStorage(
azureKubeflowClusterConfig.keyVault.vaultName, azureKubeflowClusterConfig.keyVault.vaultName,
azureKubeflowClusterConfig.keyVault.name, azureKubeflowClusterConfig.keyVault.name
azureKubeflowClusterConfig.azureStorage.accountName,
azureKubeflowClusterConfig.azureStorage.azureShare
); );
} else if (this.kubeflowClusterConfig.storageType === 'nfs') { } else if (this.kubeflowClusterConfig.storageType === 'nfs') {
const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig; const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
...@@ -139,8 +134,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -139,8 +134,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
this.kubernetesCRDClient = KubeflowOperatorClientFactory.createClient( this.kubernetesCRDClient = KubeflowOperatorClientFactory.createClient(
this.kubeflowClusterConfig.operator, this.kubeflowClusterConfig.apiVersion); this.kubeflowClusterConfig.operator, this.kubeflowClusterConfig.apiVersion);
break; break;
}
case TrialConfigMetadataKey.TRIAL_CONFIG: case TrialConfigMetadataKey.TRIAL_CONFIG: {
if (this.kubeflowClusterConfig === undefined) { if (this.kubeflowClusterConfig === undefined) {
this.log.error('kubeflow cluster config is not initialized'); this.log.error('kubeflow cluster config is not initialized');
...@@ -163,6 +158,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -163,6 +158,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
return Promise.reject(new Error(error)); return Promise.reject(new Error(error));
} }
break; break;
}
case TrialConfigMetadataKey.VERSION_CHECK: case TrialConfigMetadataKey.VERSION_CHECK:
this.versionCheck = (value === 'true' || value === 'True'); this.versionCheck = (value === 'true' || value === 'True');
break; break;
...@@ -235,7 +231,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -235,7 +231,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
//create tmp trial working folder locally. //create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`); await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);
const runScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT; const runScriptContent: string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files // Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' });
...@@ -293,14 +289,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -293,14 +289,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw Error(`operator ${this.kubeflowClusterConfig.operator} is invalid`); throw Error(`operator ${this.kubeflowClusterConfig.operator} is invalid`);
} }
const workerPodResources : any = {}; const workerPodResources: any = {};
if (kubeflowTrialConfig.worker !== undefined) { if (kubeflowTrialConfig.worker !== undefined) {
workerPodResources.requests = this.generatePodResource(kubeflowTrialConfig.worker.memoryMB, kubeflowTrialConfig.worker.cpuNum, workerPodResources.requests = this.generatePodResource(kubeflowTrialConfig.worker.memoryMB, kubeflowTrialConfig.worker.cpuNum,
kubeflowTrialConfig.worker.gpuNum); kubeflowTrialConfig.worker.gpuNum);
} }
workerPodResources.limits = {...workerPodResources.requests}; workerPodResources.limits = {...workerPodResources.requests};
const nonWorkerResources : any = {}; const nonWorkerResources: any = {};
if (this.kubeflowClusterConfig.operator === 'tf-operator') { if (this.kubeflowClusterConfig.operator === 'tf-operator') {
const tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig; const tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
if (tensorflowTrialConfig.ps !== undefined) { if (tensorflowTrialConfig.ps !== undefined) {
...@@ -330,8 +326,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -330,8 +326,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param workerPodResources worker pod template * @param workerPodResources worker pod template
* @param nonWorkerPodResources non-worker pod template, like ps or master * @param nonWorkerPodResources non-worker pod template, like ps or master
*/ */
private async generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName : string, workerPodResources : any, private async generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName: string, workerPodResources: any,
nonWorkerPodResources?: any) : Promise<any> { nonWorkerPodResources?: any): Promise<any> {
if (this.kubeflowClusterConfig === undefined) { if (this.kubeflowClusterConfig === undefined) {
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
} }
...@@ -348,11 +344,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -348,11 +344,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
const replicaSpecsObjMap: Map<string, object> = new Map<string, object>(); const replicaSpecsObjMap: Map<string, object> = new Map<string, object>();
if (this.kubeflowTrialConfig.operatorType === 'tf-operator') { if (this.kubeflowTrialConfig.operatorType === 'tf-operator') {
const tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig; const tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
let privateRegistrySecretName = await this.createRegistrySecret(tensorflowTrialConfig.worker.privateRegistryAuthPath); const privateRegistrySecretName = await this.createRegistrySecret(tensorflowTrialConfig.worker.privateRegistryAuthPath);
replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas, replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas,
tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources, privateRegistrySecretName); tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources, privateRegistrySecretName);
if (tensorflowTrialConfig.ps !== undefined) { if (tensorflowTrialConfig.ps !== undefined) {
let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(tensorflowTrialConfig.ps.privateRegistryAuthPath); const privateRegistrySecretName: string | undefined = await this.createRegistrySecret(tensorflowTrialConfig.ps.privateRegistryAuthPath);
replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas, replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas,
tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources, privateRegistrySecretName); tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources, privateRegistrySecretName);
} }
...@@ -360,11 +356,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -360,11 +356,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} else if (this.kubeflowTrialConfig.operatorType === 'pytorch-operator') { } else if (this.kubeflowTrialConfig.operatorType === 'pytorch-operator') {
const pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig; const pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;
if (pytorchTrialConfig.worker !== undefined) { if (pytorchTrialConfig.worker !== undefined) {
let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(pytorchTrialConfig.worker.privateRegistryAuthPath); const privateRegistrySecretName: string | undefined = await this.createRegistrySecret(pytorchTrialConfig.worker.privateRegistryAuthPath);
replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas, replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas,
pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources, privateRegistrySecretName); pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources, privateRegistrySecretName);
} }
let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(pytorchTrialConfig.master.privateRegistryAuthPath); const privateRegistrySecretName: string | undefined = await this.createRegistrySecret(pytorchTrialConfig.master.privateRegistryAuthPath);
replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas, replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas,
pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources, privateRegistrySecretName); pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources, privateRegistrySecretName);
...@@ -448,7 +444,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -448,7 +444,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
resources: podResources resources: podResources
} }
]); ]);
let spec: any = { const spec: any = {
containers: containersSpecMap.get('containers'), containers: containersSpecMap.get('containers'),
restartPolicy: 'ExitCode', restartPolicy: 'ExitCode',
volumes: volumeSpecMap.get('nniVolumes') volumes: volumeSpecMap.get('nniVolumes')
...@@ -463,7 +459,6 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -463,7 +459,6 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
replicas: replicaNumber, replicas: replicaNumber,
template: { template: {
metadata: { metadata: {
// tslint:disable-next-line:no-null-keyword
creationTimestamp: null creationTimestamp: null
}, },
spec: spec spec: spec
...@@ -471,5 +466,4 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -471,5 +466,4 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} }
} }
} }
// tslint:enable: no-unsafe-any no-any
export { KubeflowTrainingService }; export { KubeflowTrainingService };
...@@ -3,13 +3,13 @@ ...@@ -3,13 +3,13 @@
'use strict'; 'use strict';
// eslint-disable-next-line @typescript-eslint/camelcase
import { Client1_10, config } from 'kubernetes-client'; import { Client1_10, config } from 'kubernetes-client';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
/** /**
* Generict Kubernetes client, target version >= 1.9 * Generict Kubernetes client, target version >= 1.9
*/ */
// tslint:disable: no-any no-unsafe-any
class GeneralK8sClient { class GeneralK8sClient {
protected readonly client: any; protected readonly client: any;
protected readonly log: Logger = getLogger(); protected readonly log: Logger = getLogger();
...@@ -21,7 +21,7 @@ class GeneralK8sClient { ...@@ -21,7 +21,7 @@ class GeneralK8sClient {
public async createSecret(secretManifest: any): Promise<boolean> { public async createSecret(secretManifest: any): Promise<boolean> {
let result: Promise<boolean>; let result: Promise<boolean>;
const response : any = await this.client.api.v1.namespaces('default').secrets const response: any = await this.client.api.v1.namespaces('default').secrets
.post({body: secretManifest}); .post({body: secretManifest});
if (response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) { if (response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) {
result = Promise.resolve(true); result = Promise.resolve(true);
...@@ -73,7 +73,7 @@ abstract class KubernetesCRDClient { ...@@ -73,7 +73,7 @@ abstract class KubernetesCRDClient {
public async createKubernetesJob(jobManifest: any): Promise<boolean> { public async createKubernetesJob(jobManifest: any): Promise<boolean> {
let result: Promise<boolean>; let result: Promise<boolean>;
const response : any = await this.operator.post({body: jobManifest}); const response: any = await this.operator.post({body: jobManifest});
if (response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) { if (response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) {
result = Promise.resolve(true); result = Promise.resolve(true);
} else { } else {
...@@ -86,7 +86,7 @@ abstract class KubernetesCRDClient { ...@@ -86,7 +86,7 @@ abstract class KubernetesCRDClient {
//TODO : replace any //TODO : replace any
public async getKubernetesJob(kubeflowJobName: string): Promise<any> { public async getKubernetesJob(kubeflowJobName: string): Promise<any> {
let result: Promise<any>; let result: Promise<any>;
const response : any = await this.operator(kubeflowJobName) const response: any = await this.operator(kubeflowJobName)
.get(); .get();
if (response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) { if (response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) {
result = Promise.resolve(response.body); result = Promise.resolve(response.body);
...@@ -104,7 +104,7 @@ abstract class KubernetesCRDClient { ...@@ -104,7 +104,7 @@ abstract class KubernetesCRDClient {
.map((labelKey: string) => `${labelKey}=${labels.get(labelKey)}`) .map((labelKey: string) => `${labelKey}=${labels.get(labelKey)}`)
.join(','); .join(',');
try { try {
const deleteResult : any = await this.operator() const deleteResult: any = await this.operator()
.delete({ .delete({
qs: { qs: {
labelSelector: matchQuery, labelSelector: matchQuery,
......
...@@ -6,7 +6,6 @@ ...@@ -6,7 +6,6 @@
export type KubernetesStorageKind = 'nfs' | 'azureStorage'; export type KubernetesStorageKind = 'nfs' | 'azureStorage';
import { MethodNotImplementedError } from '../../common/errors'; import { MethodNotImplementedError } from '../../common/errors';
// tslint:disable: completed-docs function-name
export abstract class KubernetesClusterConfig { export abstract class KubernetesClusterConfig {
public readonly storage?: KubernetesStorageKind; public readonly storage?: KubernetesStorageKind;
public readonly apiVersion: string; public readonly apiVersion: string;
...@@ -91,7 +90,6 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig { ...@@ -91,7 +90,6 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
} }
} }
// tslint:disable-next-line:no-unnecessary-class
export class KubernetesClusterConfigFactory { export class KubernetesClusterConfigFactory {
public static generateKubernetesClusterConfig(jsonObject: object): KubernetesClusterConfig { public static generateKubernetesClusterConfig(jsonObject: object): KubernetesClusterConfig {
...@@ -113,11 +111,11 @@ export class KubernetesClusterConfigFactory { ...@@ -113,11 +111,11 @@ export class KubernetesClusterConfigFactory {
*/ */
export class NFSConfig { export class NFSConfig {
// IP Adress of NFS server // IP Adress of NFS server
public readonly server : string; public readonly server: string;
// exported NFS path on NFS server // exported NFS path on NFS server
public readonly path : string; public readonly path: string;
constructor(server : string, path : string) { constructor(server: string, path: string) {
this.server = server; this.server = server;
this.path = path; this.path = path;
} }
...@@ -129,11 +127,11 @@ export class NFSConfig { ...@@ -129,11 +127,11 @@ export class NFSConfig {
*/ */
export class KeyVaultConfig { export class KeyVaultConfig {
// The vault-name to specify vault // The vault-name to specify vault
public readonly vaultName : string; public readonly vaultName: string;
// The name to specify private key // The name to specify private key
public readonly name : string; public readonly name: string;
constructor(vaultName : string, name : string) { constructor(vaultName: string, name: string) {
this.vaultName = vaultName; this.vaultName = vaultName;
this.name = name; this.name = name;
} }
...@@ -144,11 +142,11 @@ export class KeyVaultConfig { ...@@ -144,11 +142,11 @@ export class KeyVaultConfig {
*/ */
export class AzureStorage { export class AzureStorage {
// The azure share to storage files // The azure share to storage files
public readonly azureShare : string; public readonly azureShare: string;
// The account name of sotrage service // The account name of sotrage service
public readonly accountName: string; public readonly accountName: string;
constructor(azureShare : string, accountName: string) { constructor(azureShare: string, accountName: string) {
this.azureShare = azureShare; this.azureShare = azureShare;
this.accountName = accountName; this.accountName = accountName;
} }
...@@ -171,12 +169,12 @@ export class KubernetesTrialConfigTemplate { ...@@ -171,12 +169,12 @@ export class KubernetesTrialConfigTemplate {
public readonly privateRegistryAuthPath?: string; public readonly privateRegistryAuthPath?: string;
// Trail command // Trail command
public readonly command : string; public readonly command: string;
// Required GPU number for trial job. The number should be in [0,100] // Required GPU number for trial job. The number should be in [0,100]
public readonly gpuNum : number; public readonly gpuNum: number;
constructor(command : string, gpuNum : number, constructor(command: string, gpuNum: number,
cpuNum: number, memoryMB: number, image: string, privateRegistryAuthPath?: string) { cpuNum: number, memoryMB: number, image: string, privateRegistryAuthPath?: string) {
this.command = command; this.command = command;
this.gpuNum = gpuNum; this.gpuNum = gpuNum;
......
...@@ -14,7 +14,7 @@ import { KubernetesTrialJobDetail } from './kubernetesData'; ...@@ -14,7 +14,7 @@ import { KubernetesTrialJobDetail } from './kubernetesData';
* Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally * Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally
*/ */
export class KubernetesJobInfoCollector { export class KubernetesJobInfoCollector {
protected readonly trialJobsMap : Map<string, KubernetesTrialJobDetail>; protected readonly trialJobsMap: Map<string, KubernetesTrialJobDetail>;
protected readonly log: Logger = getLogger(); protected readonly log: Logger = getLogger();
protected readonly statusesNeedToCheck: TrialJobStatus[]; protected readonly statusesNeedToCheck: TrialJobStatus[];
...@@ -23,9 +23,9 @@ export class KubernetesJobInfoCollector { ...@@ -23,9 +23,9 @@ export class KubernetesJobInfoCollector {
this.statusesNeedToCheck = ['RUNNING', 'WAITING']; this.statusesNeedToCheck = ['RUNNING', 'WAITING'];
} }
public async retrieveTrialStatus(kubernetesCRDClient: KubernetesCRDClient | undefined) : Promise<void> { public async retrieveTrialStatus(kubernetesCRDClient: KubernetesCRDClient | undefined): Promise<void> {
assert(kubernetesCRDClient !== undefined); assert(kubernetesCRDClient !== undefined);
const updateKubernetesTrialJobs : Promise<void>[] = []; const updateKubernetesTrialJobs: Promise<void>[] = [];
for (const [trialJobId, kubernetesTrialJob] of this.trialJobsMap) { for (const [trialJobId, kubernetesTrialJob] of this.trialJobsMap) {
if (kubernetesTrialJob === undefined) { if (kubernetesTrialJob === undefined) {
throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`); throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`);
...@@ -41,7 +41,7 @@ export class KubernetesJobInfoCollector { ...@@ -41,7 +41,7 @@ export class KubernetesJobInfoCollector {
} }
protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined, protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> { kubernetesTrialJob: KubernetesTrialJobDetail): Promise<void> {
throw new MethodNotImplementedError(); throw new MethodNotImplementedError();
} }
} }
...@@ -25,8 +25,7 @@ export class KubernetesJobRestServer extends ClusterJobRestServer { ...@@ -25,8 +25,7 @@ export class KubernetesJobRestServer extends ClusterJobRestServer {
this.kubernetesTrainingService = kubernetesTrainingService; this.kubernetesTrainingService = kubernetesTrainingService;
} }
// tslint:disable-next-line:no-any protected handleTrialMetrics(jobId: string, metrics: any[]): void {
protected handleTrialMetrics(jobId : string, metrics : any[]) : void {
if (this.kubernetesTrainingService === undefined) { if (this.kubernetesTrainingService === undefined) {
throw Error('kubernetesTrainingService not initialized!'); throw Error('kubernetesTrainingService not initialized!');
} }
......
...@@ -22,8 +22,7 @@ import { KubernetesClusterConfig } from './kubernetesConfig'; ...@@ -22,8 +22,7 @@ import { KubernetesClusterConfig } from './kubernetesConfig';
import { kubernetesScriptFormat, KubernetesTrialJobDetail } from './kubernetesData'; import { kubernetesScriptFormat, KubernetesTrialJobDetail } from './kubernetesData';
import { KubernetesJobRestServer } from './kubernetesJobRestServer'; import { KubernetesJobRestServer } from './kubernetesJobRestServer';
var yaml = require('js-yaml'); const fs = require('fs');
var fs = require('fs');
/** /**
* Training Service implementation for Kubernetes * Training Service implementation for Kubernetes
...@@ -36,7 +35,7 @@ abstract class KubernetesTrainingService { ...@@ -36,7 +35,7 @@ abstract class KubernetesTrainingService {
// experiment root dir in NFS // experiment root dir in NFS
protected readonly trialLocalNFSTempFolder: string; protected readonly trialLocalNFSTempFolder: string;
protected stopping: boolean = false; protected stopping: boolean = false;
protected experimentId! : string; protected experimentId!: string;
protected kubernetesRestServerPort?: number; protected kubernetesRestServerPort?: number;
protected readonly CONTAINER_MOUNT_PATH: string; protected readonly CONTAINER_MOUNT_PATH: string;
protected azureStorageClient?: azureStorage.FileService; protected azureStorageClient?: azureStorage.FileService;
...@@ -62,14 +61,18 @@ abstract class KubernetesTrainingService { ...@@ -62,14 +61,18 @@ abstract class KubernetesTrainingService {
this.logCollection = 'none'; this.logCollection = 'none';
} }
// tslint:disable:no-any
public generatePodResource(memory: number, cpuNum: number, gpuNum: number): any { public generatePodResource(memory: number, cpuNum: number, gpuNum: number): any {
return { const resources: any = {
memory: `${memory}Mi`, memory: `${memory}Mi`,
cpu: `${cpuNum}`, cpu: `${cpuNum}`
'nvidia.com/gpu': `${gpuNum}`
}; };
} // tslint:enable:no-any
if (gpuNum !== 0) {
resources['nvidia.com/gpu'] = `${gpuNum}`;
}
return resources;
}
public async listTrialJobs(): Promise<TrialJobDetail[]> { public async listTrialJobs(): Promise<TrialJobDetail[]> {
const jobs: TrialJobDetail[] = []; const jobs: TrialJobDetail[] = [];
...@@ -108,12 +111,12 @@ abstract class KubernetesTrainingService { ...@@ -108,12 +111,12 @@ abstract class KubernetesTrainingService {
return Promise.resolve(''); return Promise.resolve('');
} }
public get MetricsEmitter() : EventEmitter { public get MetricsEmitter(): EventEmitter {
return this.metricsEmitter; return this.metricsEmitter;
} }
public async cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise<void> { public async cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise<void> {
const trialJobDetail : KubernetesTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); const trialJobDetail: KubernetesTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
if (trialJobDetail === undefined) { if (trialJobDetail === undefined) {
const errorMessage: string = `CancelTrialJob: trial job id ${trialJobId} not found`; const errorMessage: string = `CancelTrialJob: trial job id ${trialJobId} not found`;
this.log.error(errorMessage); this.log.error(errorMessage);
...@@ -193,7 +196,6 @@ abstract class KubernetesTrainingService { ...@@ -193,7 +196,6 @@ abstract class KubernetesTrainingService {
await this.kubernetesJobRestServer.stop(); await this.kubernetesJobRestServer.stop();
this.log.info('Kubernetes Training service rest server stopped successfully.'); this.log.info('Kubernetes Training service rest server stopped successfully.');
} catch (error) { } catch (error) {
// tslint:disable-next-line: no-unsafe-any
this.log.error(`Kubernetes Training service rest server stopped failed, error: ${error.message}`); this.log.error(`Kubernetes Training service rest server stopped failed, error: ${error.message}`);
return Promise.reject(error); return Promise.reject(error);
...@@ -202,8 +204,7 @@ abstract class KubernetesTrainingService { ...@@ -202,8 +204,7 @@ abstract class KubernetesTrainingService {
return Promise.resolve(); return Promise.resolve();
} }
// tslint:disable: no-unsafe-any no-any protected async createAzureStorage(vaultName: string, valutKeyName: string): Promise<void> {
protected async createAzureStorage(vaultName: string, valutKeyName: string, accountName: string, azureShare: string): Promise<void> {
try { try {
const result: any = await cpp.exec(`az keyvault secret show --name ${valutKeyName} --vault-name ${vaultName}`); const result: any = await cpp.exec(`az keyvault secret show --name ${valutKeyName} --vault-name ${vaultName}`);
if (result.stderr) { if (result.stderr) {
...@@ -249,7 +250,6 @@ abstract class KubernetesTrainingService { ...@@ -249,7 +250,6 @@ abstract class KubernetesTrainingService {
return Promise.resolve(); return Promise.resolve();
} }
// tslint:enable: no-unsafe-any no-any
/** /**
* Genereate run script for different roles(like worker or ps) * Genereate run script for different roles(like worker or ps)
...@@ -265,9 +265,8 @@ abstract class KubernetesTrainingService { ...@@ -265,9 +265,8 @@ abstract class KubernetesTrainingService {
// Refer https://github.com/NVIDIA/k8s-device-plugin/issues/61 // Refer https://github.com/NVIDIA/k8s-device-plugin/issues/61
// So we have to explicitly set CUDA_VISIBLE_DEVICES to empty if user sets gpuNum to 0 in NNI config file // So we have to explicitly set CUDA_VISIBLE_DEVICES to empty if user sets gpuNum to 0 in NNI config file
if (gpuNum === 0) { if (gpuNum === 0) {
nvidiaScript = `export CUDA_VISIBLE_DEVICES='0'`; nvidiaScript = 'export CUDA_VISIBLE_DEVICES=';
} }
// tslint:disable-next-line: strict-boolean-expressions
const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address(); const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address();
const version: string = this.versionCheck ? await getVersion() : ''; const version: string = this.versionCheck ? await getVersion() : '';
const runScript: string = String.Format( const runScript: string = String.Format(
...@@ -307,8 +306,8 @@ abstract class KubernetesTrainingService { ...@@ -307,8 +306,8 @@ abstract class KubernetesTrainingService {
if(filePath === undefined || filePath === '') { if(filePath === undefined || filePath === '') {
return undefined; return undefined;
} }
let body = fs.readFileSync(filePath).toString('base64'); const body = fs.readFileSync(filePath).toString('base64');
let registrySecretName = String.Format('nni-secret-{0}', uniqueString(8) const registrySecretName = String.Format('nni-secret-{0}', uniqueString(8)
.toLowerCase()); .toLowerCase());
await this.genericK8sClient.createSecret( await this.genericK8sClient.createSecret(
{ {
...@@ -331,7 +330,7 @@ abstract class KubernetesTrainingService { ...@@ -331,7 +330,7 @@ abstract class KubernetesTrainingService {
return registrySecretName; return registrySecretName;
} }
protected async uploadFilesToAzureStorage(trialJobId: string, trialLocalTempFolder: String, codeDir: String, uploadRetryCount: number | undefined): Promise<string> { protected async uploadFilesToAzureStorage(trialJobId: string, trialLocalTempFolder: string, codeDir: string, uploadRetryCount: number | undefined): Promise<string> {
if (this.azureStorageClient === undefined) { if (this.azureStorageClient === undefined) {
throw new Error('azureStorageClient is not initialized'); throw new Error('azureStorageClient is not initialized');
} }
......
...@@ -4,11 +4,9 @@ ...@@ -4,11 +4,9 @@
'use strict'; 'use strict';
import * as cpp from 'child-process-promise'; import * as cpp from 'child-process-promise';
import * as cp from 'child_process';
import * as fs from 'fs'; import * as fs from 'fs';
import * as os from 'os'; import * as os from 'os';
import * as path from 'path'; import * as path from 'path';
import { String } from 'typescript-string-operations';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { delay } from '../../common/utils'; import { delay } from '../../common/utils';
import { GPUInfo, GPUSummary } from '../common/gpuData'; import { GPUInfo, GPUSummary } from '../common/gpuData';
...@@ -88,7 +86,6 @@ class GPUScheduler { ...@@ -88,7 +86,6 @@ class GPUScheduler {
runGpuMetricsCollector(this.gpuMetricCollectorScriptFolder); runGpuMetricsCollector(this.gpuMetricCollectorScriptFolder);
} }
// tslint:disable:non-literal-fs-path
private async updateGPUSummary(): Promise<void> { private async updateGPUSummary(): Promise<void> {
const gpuMetricPath: string = path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics'); const gpuMetricPath: string = path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics');
if (fs.existsSync(gpuMetricPath)) { if (fs.existsSync(gpuMetricPath)) {
......
...@@ -31,7 +31,6 @@ import { GPUScheduler } from './gpuScheduler'; ...@@ -31,7 +31,6 @@ import { GPUScheduler } from './gpuScheduler';
* success: true if the buffer contains at least one complete command; otherwise false * success: true if the buffer contains at least one complete command; otherwise false
* remain: remaining data after the first command * remain: remaining data after the first command
*/ */
// tslint:disable:newline-per-chained-call informative-docs
function decodeCommand(data: Buffer): [boolean, string, string, Buffer] { function decodeCommand(data: Buffer): [boolean, string, string, Buffer] {
if (data.length < 8) { if (data.length < 8) {
return [false, '', '', data]; return [false, '', '', data];
...@@ -46,7 +45,6 @@ function decodeCommand(data: Buffer): [boolean, string, string, Buffer] { ...@@ -46,7 +45,6 @@ function decodeCommand(data: Buffer): [boolean, string, string, Buffer] {
return [true, commandType, content, remain]; return [true, commandType, content, remain];
} }
// tslint:enable:newline-per-chained-call informative-docs
/** /**
* LocalTrialJobDetail * LocalTrialJobDetail
...@@ -107,7 +105,7 @@ class LocalTrainingService implements TrainingService { ...@@ -107,7 +105,7 @@ class LocalTrainingService implements TrainingService {
private initialized: boolean; private initialized: boolean;
private stopping: boolean; private stopping: boolean;
private rootDir!: string; private rootDir!: string;
private readonly experimentId! : string; private readonly experimentId!: string;
private gpuScheduler!: GPUScheduler; private gpuScheduler!: GPUScheduler;
private readonly occupiedGpuIndexNumMap: Map<number, number>; private readonly occupiedGpuIndexNumMap: Map<number, number>;
private designatedGpuIndices!: Set<number>; private designatedGpuIndices!: Set<number>;
...@@ -252,7 +250,6 @@ class LocalTrainingService implements TrainingService { ...@@ -252,7 +250,6 @@ class LocalTrainingService implements TrainingService {
public async setClusterMetadata(key: string, value: string): Promise<void> { public async setClusterMetadata(key: string, value: string): Promise<void> {
if (!this.initialized) { if (!this.initialized) {
this.rootDir = getExperimentRootDir(); this.rootDir = getExperimentRootDir();
// tslint:disable-next-line:non-literal-fs-path
if (!fs.existsSync(this.rootDir)) { if (!fs.existsSync(this.rootDir)) {
await cpp.exec(`powershell.exe mkdir ${this.rootDir}`); await cpp.exec(`powershell.exe mkdir ${this.rootDir}`);
} }
...@@ -299,7 +296,7 @@ class LocalTrainingService implements TrainingService { ...@@ -299,7 +296,7 @@ class LocalTrainingService implements TrainingService {
public getClusterMetadata(key: string): Promise<string> { public getClusterMetadata(key: string): Promise<string> {
switch (key) { switch (key) {
case TrialConfigMetadataKey.TRIAL_CONFIG: case TrialConfigMetadataKey.TRIAL_CONFIG: {
let getResult: Promise<string>; let getResult: Promise<string>;
if (this.localTrialConfig === undefined) { if (this.localTrialConfig === undefined) {
getResult = Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, `${key} is never set yet`)); getResult = Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, `${key} is never set yet`));
...@@ -308,6 +305,7 @@ class LocalTrainingService implements TrainingService { ...@@ -308,6 +305,7 @@ class LocalTrainingService implements TrainingService {
} }
return getResult; return getResult;
}
default: default:
return Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, 'Key not found')); return Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, 'Key not found'));
} }
...@@ -523,8 +521,8 @@ class LocalTrainingService implements TrainingService { ...@@ -523,8 +521,8 @@ class LocalTrainingService implements TrainingService {
await this.writeParameterFile(trialJobDetail.workingDirectory, trialJobDetail.form.hyperParameters); await this.writeParameterFile(trialJobDetail.workingDirectory, trialJobDetail.form.hyperParameters);
const trialJobProcess: cp.ChildProcess = runScript(path.join(trialJobDetail.workingDirectory, scriptName)); const trialJobProcess: cp.ChildProcess = runScript(path.join(trialJobDetail.workingDirectory, scriptName));
this.setTrialJobStatus(trialJobDetail, 'RUNNING'); this.setTrialJobStatus(trialJobDetail, 'RUNNING');
trialJobDetail.startTime = Date.now(); trialJobDetail.startTime = Date.now(); // eslint-disable-line require-atomic-updates
trialJobDetail.pid = trialJobProcess.pid; trialJobDetail.pid = trialJobProcess.pid; // eslint-disable-line require-atomic-updates
this.setExtraProperties(trialJobDetail, resource); this.setExtraProperties(trialJobDetail, resource);
let buffer: Buffer = Buffer.alloc(0); let buffer: Buffer = Buffer.alloc(0);
......
...@@ -17,7 +17,6 @@ export namespace HDFSClientUtility { ...@@ -17,7 +17,6 @@ export namespace HDFSClientUtility {
* @param hdfsUserName HDFS user name * @param hdfsUserName HDFS user name
*/ */
export function hdfsExpRootDir(hdfsUserName: string): string { export function hdfsExpRootDir(hdfsUserName: string): string {
// tslint:disable-next-line:prefer-template
return '/' + unixPathJoin(hdfsUserName, 'nni', 'experiments', getExperimentId()); return '/' + unixPathJoin(hdfsUserName, 'nni', 'experiments', getExperimentId());
} }
...@@ -47,11 +46,9 @@ export namespace HDFSClientUtility { ...@@ -47,11 +46,9 @@ export namespace HDFSClientUtility {
* @param hdfsFilePath hdfs file path(target) * @param hdfsFilePath hdfs file path(target)
* @param hdfsClient hdfs client * @param hdfsClient hdfs client
*/ */
// tslint:disable: no-unsafe-any non-literal-fs-path no-any export async function copyFileToHdfs(localFilePath: string, hdfsFilePath: string, hdfsClient: any): Promise<void> {
export async function copyFileToHdfs(localFilePath : string, hdfsFilePath : string, hdfsClient : any) : Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
// tslint:disable-next-line:non-literal-fs-path fs.exists(localFilePath, (exists: boolean) => {
fs.exists(localFilePath, (exists : boolean) => {
// Detect if local file exist // Detect if local file exist
if (exists) { if (exists) {
const localFileStream: fs.ReadStream = fs.createReadStream(localFilePath); const localFileStream: fs.ReadStream = fs.createReadStream(localFilePath);
...@@ -60,7 +57,7 @@ export namespace HDFSClientUtility { ...@@ -60,7 +57,7 @@ export namespace HDFSClientUtility {
hdfsFileStream.on('finish', () => { hdfsFileStream.on('finish', () => {
deferred.resolve(); deferred.resolve();
}); });
hdfsFileStream.on('error', (err : any) => { hdfsFileStream.on('error', (err: any) => {
getLogger() getLogger()
.error(`HDFSCientUtility:copyFileToHdfs, copy file failed, err is ${err.message}`); .error(`HDFSCientUtility:copyFileToHdfs, copy file failed, err is ${err.message}`);
deferred.reject(err); deferred.reject(err);
...@@ -82,7 +79,7 @@ export namespace HDFSClientUtility { ...@@ -82,7 +79,7 @@ export namespace HDFSClientUtility {
* @param hdfsDirectory HDFS directory * @param hdfsDirectory HDFS directory
* @param hdfsClient HDFS client * @param hdfsClient HDFS client
*/ */
export async function copyDirectoryToHdfs(localDirectory : string, hdfsDirectory : string, hdfsClient : any) : Promise<void> { export async function copyDirectoryToHdfs(localDirectory: string, hdfsDirectory: string, hdfsClient: any): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
// TODO: fs.readdirSync doesn't support ~($HOME) // TODO: fs.readdirSync doesn't support ~($HOME)
const fileNameArray: string[] = fs.readdirSync(localDirectory); const fileNameArray: string[] = fs.readdirSync(localDirectory);
...@@ -90,7 +87,6 @@ export namespace HDFSClientUtility { ...@@ -90,7 +87,6 @@ export namespace HDFSClientUtility {
for (const fileName of fileNameArray) { for (const fileName of fileNameArray) {
const fullFilePath: string = path.join(localDirectory, fileName); const fullFilePath: string = path.join(localDirectory, fileName);
try { try {
// tslint:disable-next-line:non-literal-fs-path
if (fs.lstatSync(fullFilePath) if (fs.lstatSync(fullFilePath)
.isFile()) { .isFile()) {
await copyFileToHdfs(fullFilePath, path.join(hdfsDirectory, fileName), hdfsClient); await copyFileToHdfs(fullFilePath, path.join(hdfsDirectory, fileName), hdfsClient);
...@@ -108,28 +104,51 @@ export namespace HDFSClientUtility { ...@@ -108,28 +104,51 @@ export namespace HDFSClientUtility {
return deferred.promise; return deferred.promise;
} }
/**
 * Check if an HDFS path already exists.
 *
 * Wraps the client's callback-style `exists` call in a promise and races it
 * against a timer, so a client that never invokes its callback cannot make
 * the caller wait forever.
 *
 * @param hdfsPath target path to check in HDFS
 * @param hdfsClient HDFS client; only its `exists(path, callback)` method is used here
 * @param timeoutMs how long to wait for the client's callback before giving up,
 *                  in milliseconds (defaults to 5000, the previously hard-coded value)
 * @returns a promise resolving to the boolean the client passes to its callback;
 *          it rejects with a string message when the timeout elapses first
 */
export async function pathExists(hdfsPath: string, hdfsClient: any, timeoutMs: number = 5000): Promise<boolean> {
    let timeoutId: ReturnType<typeof setTimeout> | undefined;

    // Adapt the callback API to a promise; the callback receives the answer directly.
    const existsCheck: Promise<boolean> = new Promise<boolean>((resolve: (exist: boolean) => void): void => {
        hdfsClient.exists(hdfsPath, (exist: boolean) => {
            resolve(exist);
        });
    });

    // This promise never resolves; it only rejects once the timeout is reached.
    const delayTimeout: Promise<boolean> = new Promise<boolean>(
        (_resolve: (exist: boolean) => void, reject: (reason: string) => void): void => {
            timeoutId = setTimeout(() => { reject(`Check HDFS path ${hdfsPath} exists timeout`); }, timeoutMs);
        });

    // Whichever settles first wins; always clear the timer so it cannot keep the process alive.
    return Promise.race([existsCheck, delayTimeout])
        .finally(() => {
            if (timeoutId !== undefined) {
                clearTimeout(timeoutId);
            }
        });
}
/** /**
* Read content from HDFS file * Read content from HDFS file
* *
* @param hdfsPath HDFS file path * @param hdfsPath HDFS file path
* @param hdfsClient HDFS client * @param hdfsClient HDFS client
*/ */
export async function readFileFromHDFS(hdfsPath : string, hdfsClient : any) : Promise<Buffer> { export async function readFileFromHDFS(hdfsPath: string, hdfsClient: any): Promise<Buffer> {
const deferred: Deferred<Buffer> = new Deferred<Buffer>(); const deferred: Deferred<Buffer> = new Deferred<Buffer>();
let buffer : Buffer = Buffer.alloc(0); let buffer: Buffer = Buffer.alloc(0);
const exist : boolean = await pathExists(hdfsPath, hdfsClient); const exist: boolean = await pathExists(hdfsPath, hdfsClient);
if (!exist) { if (!exist) {
deferred.reject(`${hdfsPath} doesn't exists`); deferred.reject(`${hdfsPath} doesn't exists`);
} }
const remoteFileStream: any = hdfsClient.createReadStream(hdfsPath); const remoteFileStream: any = hdfsClient.createReadStream(hdfsPath);
remoteFileStream.on('error', (err : any) => { remoteFileStream.on('error', (err: any) => {
// Reject with the error // Reject with the error
deferred.reject(err); deferred.reject(err);
}); });
remoteFileStream.on('data', (chunk : any) => { remoteFileStream.on('data', (chunk: any) => {
// Concat the data chunk to buffer // Concat the data chunk to buffer
buffer = Buffer.concat([buffer, chunk]); buffer = Buffer.concat([buffer, chunk]);
}); });
...@@ -142,39 +161,16 @@ export namespace HDFSClientUtility { ...@@ -142,39 +161,16 @@ export namespace HDFSClientUtility {
return deferred.promise; return deferred.promise;
} }
/**
* Check if an HDFS path already exists.
*
* Calls the client's callback-style `exists` and races the answer against a
* 5 second timer. Resolves with the boolean the client passes to its callback;
* rejects with a string message if the timer fires first.
*
* @param hdfsPath target path to check in HDFS
* @param hdfsClient HDFS client; only its `exists(path, callback)` method is used here
* @returns promise of the boolean reported by the client
*/
export async function pathExists(hdfsPath : string, hdfsClient : any) : Promise<boolean> {
const deferred : Deferred<boolean> = new Deferred<boolean>();
// The callback receives the answer directly, so it is forwarded to the deferred as-is
hdfsClient.exists(hdfsPath, (exist : boolean) => {
deferred.resolve(exist);
});
let timeoutId : NodeJS.Timer;
// This promise never resolves (`resolve` is unused); it only rejects on timeout
const delayTimeout : Promise<boolean> = new Promise<boolean>((resolve : Function, reject : Function) : void => {
// Set a timer and reject the promise once the timeout (5 seconds) is reached
timeoutId = setTimeout(() => { reject(`Check HDFS path ${hdfsPath} exists timeout`); }, 5000);
});
// Whichever promise settles first decides the result; the timer is cleared either way
return Promise.race([deferred.promise, delayTimeout])
.finally(() => { clearTimeout(timeoutId); });
}
/** /**
* Mkdir in HDFS, use default permission 755 * Mkdir in HDFS, use default permission 755
* *
* @param hdfsPath the path in HDFS. It could be either file or directory * @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient HDFS client * @param hdfsClient HDFS client
*/ */
export function mkdir(hdfsPath : string, hdfsClient : any) : Promise<boolean> { export function mkdir(hdfsPath: string, hdfsClient: any): Promise<boolean> {
const deferred : Deferred<boolean> = new Deferred<boolean>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
hdfsClient.mkdir(hdfsPath, (err : any) => { hdfsClient.mkdir(hdfsPath, (err: any) => {
if (!err) { if (!err) {
deferred.resolve(true); deferred.resolve(true);
} else { } else {
...@@ -191,14 +187,14 @@ export namespace HDFSClientUtility { ...@@ -191,14 +187,14 @@ export namespace HDFSClientUtility {
* @param hdfsPath the path in HDFS. It could be either file or directory * @param hdfsPath the path in HDFS. It could be either file or directory
* @param hdfsClient HDFS client * @param hdfsClient HDFS client
*/ */
export async function readdir(hdfsPath : string, hdfsClient : any) : Promise<string[]> { export async function readdir(hdfsPath: string, hdfsClient: any): Promise<string[]> {
const deferred : Deferred<string[]> = new Deferred<string[]>(); const deferred: Deferred<string[]> = new Deferred<string[]>();
const exist : boolean = await pathExists(hdfsPath, hdfsClient); const exist: boolean = await pathExists(hdfsPath, hdfsClient);
if (!exist) { if (!exist) {
deferred.reject(`${hdfsPath} doesn't exists`); deferred.reject(`${hdfsPath} doesn't exists`);
} }
hdfsClient.readdir(hdfsPath, (err : any, files : any[]) => { hdfsClient.readdir(hdfsPath, (err: any, files: any[]) => {
if (err) { if (err) {
deferred.reject(err); deferred.reject(err);
} }
...@@ -215,9 +211,9 @@ export namespace HDFSClientUtility { ...@@ -215,9 +211,9 @@ export namespace HDFSClientUtility {
* @param hdfsClient HDFS client * @param hdfsClient HDFS client
* @param recursive Mark if need to delete recursively * @param recursive Mark if need to delete recursively
*/ */
export function deletePath(hdfsPath : string, hdfsClient : any, recursive : boolean = true) : Promise<boolean> { export function deletePath(hdfsPath: string, hdfsClient: any, recursive: boolean = true): Promise<boolean> {
const deferred : Deferred<boolean> = new Deferred<boolean>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
hdfsClient.unlink(hdfsPath, recursive, (err : any) => { hdfsClient.unlink(hdfsPath, recursive, (err: any) => {
if (!err) { if (!err) {
deferred.resolve(true); deferred.resolve(true);
} else { } else {
...@@ -227,5 +223,4 @@ export namespace HDFSClientUtility { ...@@ -227,5 +223,4 @@ export namespace HDFSClientUtility {
return deferred.promise; return deferred.promise;
} }
// tslint:enable: no-unsafe-any non-literal-fs-path no-any
} }
...@@ -24,7 +24,7 @@ export class PAITaskRole { ...@@ -24,7 +24,7 @@ export class PAITaskRole {
//Shared memory for one task in the task role //Shared memory for one task in the task role
public readonly shmMB?: number; public readonly shmMB?: number;
//portList to specify the port used in container //portList to specify the port used in container
public portList?: portListMetaData[]; public portList?: PortListMetaData[];
/** /**
* Constructor * Constructor
...@@ -35,8 +35,8 @@ export class PAITaskRole { ...@@ -35,8 +35,8 @@ export class PAITaskRole {
* @param gpuNumber GPU number for one task in the task role, no less than 0 * @param gpuNumber GPU number for one task in the task role, no less than 0
* @param command Executable command for tasks in the task role, can not be empty * @param command Executable command for tasks in the task role, can not be empty
*/ */
constructor(name : string, taskNumber : number, cpuNumber : number, memoryMB : number, gpuNumber : number, constructor(name: string, taskNumber: number, cpuNumber: number, memoryMB: number, gpuNumber: number,
command : string, shmMB?: number, portList?: portListMetaData[]) { command: string, shmMB?: number, portList?: PortListMetaData[]) {
this.name = name; this.name = name;
this.taskNumber = taskNumber; this.taskNumber = taskNumber;
this.cpuNumber = cpuNumber; this.cpuNumber = cpuNumber;
...@@ -75,8 +75,8 @@ export class PAIJobConfig { ...@@ -75,8 +75,8 @@ export class PAIJobConfig {
* @param outputDir Output directory on HDFS * @param outputDir Output directory on HDFS
* @param taskRoles List of taskRole, one task role at least * @param taskRoles List of taskRole, one task role at least
*/ */
constructor(jobName: string, image : string, codeDir : string, constructor(jobName: string, image: string, codeDir: string,
taskRoles : PAITaskRole[], virtualCluster: string, authFile?: string) { taskRoles: PAITaskRole[], virtualCluster: string, authFile?: string) {
this.jobName = jobName; this.jobName = jobName;
this.image = image; this.image = image;
this.codeDir = codeDir; this.codeDir = codeDir;
...@@ -102,7 +102,7 @@ export class PAIClusterConfig { ...@@ -102,7 +102,7 @@ export class PAIClusterConfig {
* @param host Host IP of PAI Cluster * @param host Host IP of PAI Cluster
* @param token PAI token of PAI Cluster * @param token PAI token of PAI Cluster
*/ */
constructor(userName: string, host : string, passWord?: string, token?: string) { constructor(userName: string, host: string, passWord?: string, token?: string) {
this.userName = userName; this.userName = userName;
this.passWord = passWord; this.passWord = passWord;
this.host = host; this.host = host;
...@@ -113,8 +113,8 @@ export class PAIClusterConfig { ...@@ -113,8 +113,8 @@ export class PAIClusterConfig {
/** /**
* portList data structure used in PAI taskRole * portList data structure used in PAI taskRole
*/ */
export class portListMetaData { export class PortListMetaData {
public readonly label : string = ''; public readonly label: string = '';
public readonly beginAt: number = 0; public readonly beginAt: number = 0;
public readonly portNumber: number = 0; public readonly portNumber: number = 0;
} }
...@@ -135,10 +135,10 @@ export class NNIPAITrialConfig extends TrialConfig { ...@@ -135,10 +135,10 @@ export class NNIPAITrialConfig extends TrialConfig {
//authentication file used for private Docker registry //authentication file used for private Docker registry
public authFile?: string; public authFile?: string;
//portList to specify the port used in container //portList to specify the port used in container
public portList?: portListMetaData[]; public portList?: PortListMetaData[];
constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, constructor(command: string, codeDir: string, gpuNum: number, cpuNum: number, memoryMB: number,
image: string, virtualCluster?: string, shmMB?: number, authFile?: string, portList?: portListMetaData[]) { image: string, virtualCluster?: string, shmMB?: number, authFile?: string, portList?: PortListMetaData[]) {
super(command, codeDir, gpuNum); super(command, codeDir, gpuNum);
this.cpuNum = cpuNum; this.cpuNum = cpuNum;
this.memoryMB = memoryMB; this.memoryMB = memoryMB;
......
...@@ -22,7 +22,7 @@ export class PAITrialJobDetail implements TrialJobDetail { ...@@ -22,7 +22,7 @@ export class PAITrialJobDetail implements TrialJobDetail {
public hdfsLogPath: string; public hdfsLogPath: string;
public isEarlyStopped?: boolean; public isEarlyStopped?: boolean;
constructor(id: string, status: TrialJobStatus, paiJobName : string, constructor(id: string, status: TrialJobStatus, paiJobName: string,
submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, hdfsLogPath: string) { submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, hdfsLogPath: string) {
this.id = id; this.id = id;
this.status = status; this.status = status;
...@@ -52,6 +52,5 @@ export const PAI_TRIAL_COMMAND_FORMAT: string = ...@@ -52,6 +52,5 @@ export const PAI_TRIAL_COMMAND_FORMAT: string =
--pai_hdfs_output_dir '{9}' --pai_hdfs_host '{10}' --pai_user_name {11} --nni_hdfs_exp_dir '{12}' --webhdfs_path '/webhdfs/api/v1' \ --pai_hdfs_output_dir '{9}' --pai_hdfs_host '{10}' --pai_user_name {11} --nni_hdfs_exp_dir '{12}' --webhdfs_path '/webhdfs/api/v1' \
--nni_manager_version '{13}' --log_collection '{14}'`; --nni_manager_version '{13}' --log_collection '{14}'`;
// tslint:disable:no-http-string
export const PAI_LOG_PATH_FORMAT: string = export const PAI_LOG_PATH_FORMAT: string =
`http://{0}/webhdfs/explorer.html#{1}`; `http://{0}/webhdfs/explorer.html#{1}`;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment