"test/ut/sdk/git@developer.sourcefind.cn:OpenDAS/nni.git" did not exist on "f60d3d5e294510d99c65ba3292822cbb922adbf8"
Commit ba8dccd6 authored by suiguoxin's avatar suiguoxin
Browse files

Merge branch 'master' of https://github.com/microsoft/nni

parents 56a1575b 150ee83a
...@@ -59,17 +59,17 @@ export class GPUSummary { ...@@ -59,17 +59,17 @@ export class GPUSummary {
} }
} }
export const GPU_INFO_COLLECTOR_FORMAT_LINUX: string = export const GPU_INFO_COLLECTOR_FORMAT_LINUX: string =
` `
#!/bin/bash #!/bin/bash
export METRIC_OUTPUT_DIR={0} export METRIC_OUTPUT_DIR={0}
echo $$ >{1} echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector python3 -m nni_gpu_tool.gpu_metrics_collector
` `;
export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string = export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
` `
$env:METRIC_OUTPUT_DIR="{0}" $env:METRIC_OUTPUT_DIR="{0}"
$app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow $app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow
Write $app.ID | Out-File {1} -NoNewline -encoding utf8 Write $app.ID | Out-File {1} -NoNewline -encoding utf8
` `;
\ No newline at end of file
...@@ -21,7 +21,10 @@ ...@@ -21,7 +21,10 @@
import { TrialJobStatus } from '../../common/trainingService'; import { TrialJobStatus } from '../../common/trainingService';
// tslint:disable-next-line:max-classes-per-file /**
* Trial job metrics class
* Representing trial job metrics properties
*/
export class JobMetrics { export class JobMetrics {
public readonly jobId: string; public readonly jobId: string;
public readonly metrics: string[]; public readonly metrics: string[];
......
...@@ -24,13 +24,13 @@ ...@@ -24,13 +24,13 @@
* Representing trial job configurable properties * Representing trial job configurable properties
*/ */
export class TrialConfig { export class TrialConfig {
/** Trail command */ // Trail command
public readonly command : string; public readonly command : string;
/** Code directory */ // Code directory
public readonly codeDir : string; public readonly codeDir : string;
/** Required GPU number for trial job. The number should be in [0,100] */ // Required GPU number for trial job. The number should be in [0,100]
public readonly gpuNum : number; public readonly gpuNum : number;
/** /**
...@@ -44,4 +44,4 @@ export class TrialConfig { ...@@ -44,4 +44,4 @@ export class TrialConfig {
this.codeDir = codeDir; this.codeDir = codeDir;
this.gpuNum = gpuNum; this.gpuNum = gpuNum;
} }
} }
\ No newline at end of file
import { getLogger } from "common/log";
/** /**
* Copyright (c) Microsoft Corporation * Copyright (c) Microsoft Corporation
* All rights reserved. * All rights reserved.
...@@ -21,44 +19,55 @@ import { getLogger } from "common/log"; ...@@ -21,44 +19,55 @@ import { getLogger } from "common/log";
'use strict'; 'use strict';
import { countFilesRecursively } from '../../common/utils'
import * as cpp from 'child-process-promise'; import * as cpp from 'child-process-promise';
import * as cp from 'child_process'; import * as cp from 'child_process';
import * as os from 'os';
import * as fs from 'fs'; import * as fs from 'fs';
import { getNewLine } from '../../common/utils'; import * as os from 'os';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData';
import * as path from 'path'; import * as path from 'path';
import { String } from 'typescript-string-operations'; import { String } from 'typescript-string-operations';
import { file } from "../../node_modules/@types/tmp"; import { countFilesRecursively, getNewLine, validateFileNameRecursively } from '../../common/utils';
import { file } from '../../node_modules/@types/tmp';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData';
/** /**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken * Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
* *
* @param codeDir codeDir in nni config file * @param codeDir codeDir in nni config file
* @returns file number under codeDir * @returns file number under codeDir
*/ */
// tslint:disable: no-redundant-jsdoc
export async function validateCodeDir(codeDir: string) : Promise<number> { export async function validateCodeDir(codeDir: string) : Promise<number> {
let fileCount: number | undefined; let fileCount: number | undefined;
let fileNameValid: boolean = true;
try { try {
fileCount = await countFilesRecursively(codeDir); fileCount = await countFilesRecursively(codeDir);
} catch(error) { } catch (error) {
throw new Error(`Call count file error: ${error}`); throw new Error(`Call count file error: ${error}`);
} }
try {
fileNameValid = await validateFileNameRecursively(codeDir);
} catch(error) {
throw new Error(`Validate file name error: ${error}`);
}
if(fileCount && fileCount > 1000) { if (fileCount !== undefined && fileCount > 1000) {
const errMessage: string = `Too many files(${fileCount} found}) in ${codeDir},` const errMessage: string = `Too many files(${fileCount} found}) in ${codeDir},`
+ ` please check if it's a valid code dir`; + ` please check if it's a valid code dir`;
throw new Error(errMessage); throw new Error(errMessage);
}
if(!fileNameValid) {
const errMessage: string = `File name in ${codeDir} is not valid, please check file names, only support digit number、alphabet and (.-_) in file name.`;
throw new Error(errMessage);
} }
return fileCount; return fileCount;
} }
/** /**
* crete a new directory * crete a new directory
* @param directory * @param directory
*/ */
export async function execMkdir(directory: string): Promise<void> { export async function execMkdir(directory: string): Promise<void> {
if (process.platform === 'win32') { if (process.platform === 'win32') {
...@@ -66,6 +75,7 @@ export async function execMkdir(directory: string): Promise<void> { ...@@ -66,6 +75,7 @@ export async function execMkdir(directory: string): Promise<void> {
} else { } else {
await cpp.exec(`mkdir -p ${directory}`); await cpp.exec(`mkdir -p ${directory}`);
} }
return Promise.resolve(); return Promise.resolve();
} }
...@@ -80,12 +90,13 @@ export async function execCopydir(source: string, destination: string): Promise< ...@@ -80,12 +90,13 @@ export async function execCopydir(source: string, destination: string): Promise<
} else { } else {
await cpp.exec(`cp -r ${source} ${destination}`); await cpp.exec(`cp -r ${source} ${destination}`);
} }
return Promise.resolve(); return Promise.resolve();
} }
/** /**
* crete a new file * crete a new file
* @param filename * @param filename
*/ */
export async function execNewFile(filename: string): Promise<void> { export async function execNewFile(filename: string): Promise<void> {
if (process.platform === 'win32') { if (process.platform === 'win32') {
...@@ -93,16 +104,17 @@ export async function execNewFile(filename: string): Promise<void> { ...@@ -93,16 +104,17 @@ export async function execNewFile(filename: string): Promise<void> {
} else { } else {
await cpp.exec(`touch ${filename}`); await cpp.exec(`touch ${filename}`);
} }
return Promise.resolve(); return Promise.resolve();
} }
/** /**
* run script * run script using powershell or bash
* @param filePath * @param filePath
*/ */
export function execScript(filePath: string): cp.ChildProcess { export function runScript(filePath: string): cp.ChildProcess {
if (process.platform === 'win32') { if (process.platform === 'win32') {
return cp.exec(`powershell.exe -file ${filePath}`); return cp.exec(`powershell.exe -ExecutionPolicy Bypass -file ${filePath}`);
} else { } else {
return cp.exec(`bash ${filePath}`); return cp.exec(`bash ${filePath}`);
} }
...@@ -110,7 +122,7 @@ export function execScript(filePath: string): cp.ChildProcess { ...@@ -110,7 +122,7 @@ export function execScript(filePath: string): cp.ChildProcess {
/** /**
* output the last line of a file * output the last line of a file
* @param filePath * @param filePath
*/ */
export async function execTail(filePath: string): Promise<cpp.childProcessPromise.Result> { export async function execTail(filePath: string): Promise<cpp.childProcessPromise.Result> {
let cmdresult: cpp.childProcessPromise.Result; let cmdresult: cpp.childProcessPromise.Result;
...@@ -119,12 +131,13 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis ...@@ -119,12 +131,13 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis
} else { } else {
cmdresult = await cpp.exec(`tail -n 1 ${filePath}`); cmdresult = await cpp.exec(`tail -n 1 ${filePath}`);
} }
return Promise.resolve(cmdresult); return Promise.resolve(cmdresult);
} }
/** /**
* delete a directory * delete a directory
* @param directory * @param directory
*/ */
export async function execRemove(directory: string): Promise<void> { export async function execRemove(directory: string): Promise<void> {
if (process.platform === 'win32') { if (process.platform === 'win32') {
...@@ -132,12 +145,13 @@ export async function execRemove(directory: string): Promise<void> { ...@@ -132,12 +145,13 @@ export async function execRemove(directory: string): Promise<void> {
} else { } else {
await cpp.exec(`rm -rf ${directory}`); await cpp.exec(`rm -rf ${directory}`);
} }
return Promise.resolve(); return Promise.resolve();
} }
/** /**
* kill a process * kill a process
* @param directory * @param directory
*/ */
export async function execKill(pid: string): Promise<void> { export async function execKill(pid: string): Promise<void> {
if (process.platform === 'win32') { if (process.platform === 'win32') {
...@@ -145,37 +159,39 @@ export async function execKill(pid: string): Promise<void> { ...@@ -145,37 +159,39 @@ export async function execKill(pid: string): Promise<void> {
} else { } else {
await cpp.exec(`pkill -P ${pid}`); await cpp.exec(`pkill -P ${pid}`);
} }
return Promise.resolve(); return Promise.resolve();
} }
/** /**
* set environment variable * get command of setting environment variable
* @param variable * @param variable
* @returns command string * @returns command string
*/ */
export function setEnvironmentVariable(variable: { key: string; value: string }): string { export function setEnvironmentVariable(variable: { key: string; value: string }): string {
if (process.platform === 'win32') { if (process.platform === 'win32') {
return `$env:${variable.key}="${variable.value}"`; return `$env:${variable.key}="${variable.value}"`;
} } else {
else{
return `export ${variable.key}=${variable.value}`; return `export ${variable.key}=${variable.value}`;
} }
} }
/** /**
* Compress files in directory to tar file * Compress files in directory to tar file
* @param source_path * @param sourcePath
* @param tar_path * @param tarPath
*/ */
export async function tarAdd(tar_path: string, source_path: string): Promise<void> { export async function tarAdd(tarPath: string, sourcePath: string): Promise<void> {
if (process.platform === 'win32') { if (process.platform === 'win32') {
tar_path = tar_path.split('\\').join('\\\\'); const tarFilePath: string = tarPath.split('\\')
source_path = source_path.split('\\').join('\\\\'); .join('\\\\');
let script: string[] = []; const sourceFilePath: string = sourcePath.split('\\')
.join('\\\\');
const script: string[] = [];
script.push( script.push(
`import os`, `import os`,
`import tarfile`, `import tarfile`,
String.Format(`tar = tarfile.open("{0}","w:gz")\r\nfor root,dir,files in os.walk("{1}"):`, tar_path, source_path), String.Format(`tar = tarfile.open("{0}","w:gz")\r\nfor root,dir,files in os.walk("{1}"):`, tarFilePath, sourceFilePath),
` for file in files:`, ` for file in files:`,
` fullpath = os.path.join(root,file)`, ` fullpath = os.path.join(root,file)`,
` tar.add(fullpath, arcname=file)`, ` tar.add(fullpath, arcname=file)`,
...@@ -184,39 +200,40 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi ...@@ -184,39 +200,40 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi
const tarScript: string = path.join(os.tmpdir(), 'tar.py'); const tarScript: string = path.join(os.tmpdir(), 'tar.py');
await cpp.exec(`python ${tarScript}`); await cpp.exec(`python ${tarScript}`);
} else { } else {
await cpp.exec(`tar -czf ${tar_path} -C ${source_path} .`); await cpp.exec(`tar -czf ${tarPath} -C ${sourcePath} .`);
} }
return Promise.resolve(); return Promise.resolve();
} }
/** /**
* generate script file name * generate script file name
* @param fileNamePrefix * @param fileNamePrefix
*/ */
export function getScriptName(fileNamePrefix: string): string { export function getScriptName(fileNamePrefix: string): string {
if (process.platform === 'win32') { if (process.platform === 'win32') {
return fileNamePrefix + '.ps1'; return String.Format('{0}.ps1', fileNamePrefix);
} else { } else {
return fileNamePrefix + '.sh'; return String.Format('{0}.sh', fileNamePrefix);
} }
} }
/** /**
* generate script file * generate script file
* @param gpuMetricCollectorScriptFolder * @param gpuMetricCollectorScriptFolder
*/ */
export function getgpuMetricsCollectorScriptContent(gpuMetricCollectorScriptFolder: string): string { export function getgpuMetricsCollectorScriptContent(gpuMetricCollectorScriptFolder: string): string {
if(process.platform === 'win32') { if (process.platform === 'win32') {
return String.Format( return String.Format(
GPU_INFO_COLLECTOR_FORMAT_WINDOWS, GPU_INFO_COLLECTOR_FORMAT_WINDOWS,
gpuMetricCollectorScriptFolder, gpuMetricCollectorScriptFolder,
path.join(gpuMetricCollectorScriptFolder, 'pid'), path.join(gpuMetricCollectorScriptFolder, 'pid')
); );
} else { } else {
return String.Format( return String.Format(
GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_LINUX,
gpuMetricCollectorScriptFolder, gpuMetricCollectorScriptFolder,
path.join(gpuMetricCollectorScriptFolder, 'pid'), path.join(gpuMetricCollectorScriptFolder, 'pid')
); );
} }
} }
...@@ -19,108 +19,126 @@ ...@@ -19,108 +19,126 @@
'use strict'; 'use strict';
import * as fs from 'fs' import * as azureStorage from 'azure-storage';
import * as fs from 'fs';
import * as path from 'path'; import * as path from 'path';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import { String } from 'typescript-string-operations';
import { getLogger } from '../../common/log'; import { getLogger } from '../../common/log';
import { mkDirP } from '../../common/utils'; import { mkDirP } from '../../common/utils';
// tslint:disable: no-redundant-jsdoc no-any no-unsafe-any
export namespace AzureStorageClientUtility { export namespace AzureStorageClientUtility {
/** /**
* create azure share * create azure share
* @param fileServerClient * @param fileServerClient
* @param azureShare * @param azureShare
*/ */
export async function createShare(fileServerClient: any, azureShare: any): Promise<void>{ export async function createShare(fileServerClient: any, azureShare: any): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
fileServerClient.createShareIfNotExists(azureShare, function(error: any, result: any, response: any) { fileServerClient.createShareIfNotExists(azureShare, (error: any, result: any, response: any) => {
if(error){ if (error) {
getLogger().error(`Create share failed:, ${error}`); getLogger()
deferred.reject(error) .error(`Create share failed:, ${error}`);
}else{ deferred.reject(error);
deferred.resolve() } else {
deferred.resolve();
} }
}) });
return deferred.promise; return deferred.promise;
} }
/** /**
* Create a new directory (NOT recursively) in azure file storage. * Create a new directory (NOT recursively) in azure file storage.
* @param fileServerClient * @param fileServerClient
* @param azureFoler * @param azureFoler
* @param azureShare * @param azureShare
*/ */
export async function createDirectory(fileServerClient: any, azureFoler: any, azureShare: any): Promise<void>{ export async function createDirectory(fileServerClient: azureStorage.FileService, azureFoler: any, azureShare: any): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, function(error: any, result: any, response: any) { fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, (error: any, result: any, response: any) => {
if(error){ if (error) {
getLogger().error(`Create directory failed:, ${error}`); getLogger()
.error(`Create directory failed:, ${error}`);
deferred.reject(error); deferred.reject(error);
}else{ } else {
deferred.resolve(); deferred.resolve();
} }
}) });
return deferred.promise; return deferred.promise;
} }
/** /**
* Create a new directory recursively in azure file storage * Create a new directory recursively in azure file storage
* @param fileServerClient * @param fileServerClient
* @param azureDirectory * @param azureDirectory
*/ */
export async function createDirectoryRecursive(fileServerClient: any, azureDirectory: any, azureShare: any): Promise<void>{ export async function createDirectoryRecursive(fileServerClient: azureStorage.FileService, azureDirectory: string,
azureShare: any): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
let directories = azureDirectory.split("/"); const directories: string[] = azureDirectory.split('/');
let rootDirectory = "" let rootDirectory: string = '';
for(let directory of directories){ for (const directory of directories) {
rootDirectory += directory; rootDirectory += directory;
await createDirectory(fileServerClient, rootDirectory, azureShare); await createDirectory(fileServerClient, rootDirectory, azureShare);
rootDirectory += '/'; rootDirectory += '/';
} }
deferred.resolve(); deferred.resolve();
return deferred.promise; return deferred.promise;
} }
/** /**
* upload a file to azure storage * upload a file to azure storage
* @param fileServerClient * @param fileServerClient
* @param azureDirectory * @param azureDirectory
* @param azureFileName * @param azureFileName
* @param azureShare * @param azureShare
* @param localFilePath * @param localFilePath
*/ */
async function uploadFileToAzure(fileServerClient: any, azureDirectory: any, azureFileName: any, azureShare: any, localFilePath: any): Promise<void>{ async function uploadFileToAzure(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any,
localFilePath: string): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
await fileServerClient.createFileFromLocalFile(azureShare, azureDirectory, azureFileName, localFilePath, function(error: any, result: any, response: any) { await fileServerClient.createFileFromLocalFile(azureShare, azureDirectory, azureFileName, localFilePath,
if(error){ (error: any, result: any, response: any) => {
getLogger().error(`Upload file failed:, ${error}`); if (error) {
getLogger()
.error(`Upload file failed:, ${error}`);
deferred.reject(error); deferred.reject(error);
}else{ } else {
deferred.resolve(); deferred.resolve();
} }
}) });
return deferred.promise; return deferred.promise;
} }
/** /**
* download a file from azure storage * download a file from azure storage
* @param fileServerClient * @param fileServerClient
* @param azureDirectory * @param azureDirectory
* @param azureFileName * @param azureFileName
* @param azureShare * @param azureShare
* @param localFilePath * @param localFilePath
*/ */
async function downloadFile(fileServerClient: any, azureDirectory: any, azureFileName: any, azureShare: any, localFilePath: any): Promise<void>{ async function downloadFile(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any,
localFilePath: string): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath), function(error: any, result: any, response: any) { // tslint:disable-next-line:non-literal-fs-path
if(error){ await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath),
getLogger().error(`Download file failed:, ${error}`); (error: any, result: any, response: any) => {
if (error) {
getLogger()
.error(`Download file failed:, ${error}`);
deferred.reject(error); deferred.reject(error);
}else{ } else {
deferred.resolve(); deferred.resolve();
} }
}) });
return deferred.promise; return deferred.promise;
} }
...@@ -131,67 +149,79 @@ export namespace AzureStorageClientUtility { ...@@ -131,67 +149,79 @@ export namespace AzureStorageClientUtility {
* @param azureShare : the azure share used * @param azureShare : the azure share used
* @param localDirectory : local directory to be uploaded * @param localDirectory : local directory to be uploaded
*/ */
export async function uploadDirectory(fileServerClient: any, azureDirectory: any, azureShare: any, localDirectory: any): Promise<void>{ // tslint:disable:non-literal-fs-path
export async function uploadDirectory(fileServerClient: azureStorage.FileService, azureDirectory: string, azureShare: any,
localDirectory: string): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
const fileNameArray: string[] = fs.readdirSync(localDirectory); const fileNameArray: string[] = fs.readdirSync(localDirectory);
await createDirectoryRecursive(fileServerClient, azureDirectory, azureShare); await createDirectoryRecursive(fileServerClient, azureDirectory, azureShare);
for(let fileName of fileNameArray){ for (const fileName of fileNameArray) {
const fullFilePath: string = path.join(localDirectory, fileName); const fullFilePath: string = path.join(localDirectory, fileName);
try { try {
if (fs.lstatSync(fullFilePath).isFile()) { if (fs.lstatSync(fullFilePath)
.isFile()) {
await uploadFileToAzure(fileServerClient, azureDirectory, fileName, azureShare, fullFilePath); await uploadFileToAzure(fileServerClient, azureDirectory, fileName, azureShare, fullFilePath);
} else { } else {
// If filePath is a directory, recuisively copy it to azure // If filePath is a directory, recuisively copy it to azure
await uploadDirectory(fileServerClient, azureDirectory + '/' + fileName, azureShare, fullFilePath); await uploadDirectory(fileServerClient, String.Format('{0}/{1}', azureDirectory, fileName), azureShare, fullFilePath);
} }
} catch(error) { } catch (error) {
deferred.reject(error); deferred.reject(error);
return deferred.promise; return deferred.promise;
} }
} }
// All files/directories are copied successfully, resolve // All files/directories are copied successfully, resolve
deferred.resolve(); deferred.resolve();
return deferred.promise; return deferred.promise;
} }
/** /**
* downlod a directory from azure * downlod a directory from azure
* @param fileServerClient * @param fileServerClient
* @param azureDirectory * @param azureDirectory
* @param azureShare * @param azureShare
* @param localDirectory * @param localDirectory
*/ */
export async function downloadDirectory(fileServerClient: any, azureDirectory:any, azureShare: any, localDirectory: any): Promise<void>{ export async function downloadDirectory(fileServerClient: any, azureDirectory: string, azureShare: any, localDirectory: string):
Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
mkDirP(localDirectory); await mkDirP(localDirectory);
fileServerClient.listFilesAndDirectoriesSegmented(azureShare, azureDirectory, 'null', function(error: any, result: any, response: any) { fileServerClient.listFilesAndDirectoriesSegmented(azureShare, azureDirectory, 'null',
if(('entries' in result) === false){ async (error: any, result: any, response: any) => {
getLogger().error(`list files failed, can't get entries in result`); if (('entries' in result) === false) {
getLogger()
.error(`list files failed, can't get entries in result`);
throw new Error(`list files failed, can't get entries in result`); throw new Error(`list files failed, can't get entries in result`);
} }
if(('files' in result['entries']) === false){ if (('files' in result.entries) === false) {
getLogger().error(`list files failed, can't get files in result['entries']`); getLogger()
.error(`list files failed, can't get files in result['entries']`);
throw new Error(`list files failed, can't get files in result['entries']`); throw new Error(`list files failed, can't get files in result['entries']`);
} }
if(('directories' in result['directories']) === false){ if (('directories' in result.directories) === false) {
getLogger().error(`list files failed, can't get directories in result['entries']`); getLogger()
.error(`list files failed, can't get directories in result['entries']`);
throw new Error(`list files failed, can't get directories in result['entries']`); throw new Error(`list files failed, can't get directories in result['entries']`);
} }
for(var fileName of result['entries']['files']){ for (const fileName of result.entries.files) {
const fullFilePath: string = path.join(localDirectory, fileName.name); const fullFilePath: string = path.join(localDirectory, fileName.name);
downloadFile(fileServerClient, azureDirectory, fileName.name, azureShare, fullFilePath) await downloadFile(fileServerClient, azureDirectory, fileName.name, azureShare, fullFilePath);
} }
for(var directoryName of result['entries']['directories']){ for (const directoryName of result.entries.directories) {
const fullDirectoryPath: string = path.join(localDirectory, directoryName.name) const fullDirectoryPath: string = path.join(localDirectory, directoryName.name);
const fullAzureDirectory: string = path.join(azureDirectory, directoryName.name) const fullAzureDirectory: string = path.join(azureDirectory, directoryName.name);
downloadDirectory(fileServerClient, fullAzureDirectory, azureShare, fullDirectoryPath) await downloadDirectory(fileServerClient, fullAzureDirectory, azureShare, fullDirectoryPath);
} }
deferred.resolve(); deferred.resolve();
}) });
return deferred.promise; return deferred.promise;
} }
} }
// tslint:enable: no-redundant-jsdoc no-any no-unsafe-any
/** /**
* Copyright (c) Microsoft Corporation * Copyright (c) Microsoft Corporation
* All rights reserved. * All rights reserved.
...@@ -20,21 +21,29 @@ ...@@ -20,21 +21,29 @@
'use strict'; 'use strict';
import * as fs from 'fs'; import * as fs from 'fs';
import { KubernetesCRDClient, GeneralK8sClient } from '../kubernetesApiClient'; import { GeneralK8sClient, KubernetesCRDClient } from '../kubernetesApiClient';
abstract class FrameworkControllerClient extends KubernetesCRDClient{ /**
* FrameworkController Client
*/
abstract class FrameworkControllerClient extends KubernetesCRDClient {
/** /**
* Factory method to generate operator cliet * Factory method to generate operator client
*/ */
// tslint:disable-next-line:function-name
public static generateFrameworkControllerClient(): KubernetesCRDClient { public static generateFrameworkControllerClient(): KubernetesCRDClient {
return new FrameworkControllerClientV1(); return new FrameworkControllerClientV1();
} }
} }
/**
* FrameworkController ClientV1
*/
class FrameworkControllerClientV1 extends FrameworkControllerClient { class FrameworkControllerClientV1 extends FrameworkControllerClient {
/** /**
* constructor, to initialize frameworkcontroller CRD definition * constructor, to initialize frameworkcontroller CRD definition
*/ */
// tslint:disable: no-unsafe-any no-any
public constructor() { public constructor() {
super(); super();
this.crdSchema = JSON.parse(fs.readFileSync('./config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json', 'utf8')); this.crdSchema = JSON.parse(fs.readFileSync('./config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json', 'utf8'));
...@@ -42,13 +51,13 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient { ...@@ -42,13 +51,13 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient {
} }
protected get operator(): any { protected get operator(): any {
return this.client.apis["frameworkcontroller.microsoft.com"].v1.namespaces('default').frameworks; return this.client.apis['frameworkcontroller.microsoft.com'].v1.namespaces('default').frameworks;
} }
// tslint:enable: no-unsafe-any no-any
public get containerName(): string { public get containerName(): string {
return 'framework'; return 'framework';
} }
} }
export { FrameworkControllerClient, GeneralK8sClient }; export { FrameworkControllerClient, GeneralK8sClient };
...@@ -20,10 +20,11 @@ ...@@ -20,10 +20,11 @@
'use strict'; 'use strict';
import * as assert from 'assert'; import * as assert from 'assert';
import { KubernetesTrialConfig, KubernetesTrialConfigTemplate, KubernetesClusterConfigAzure, import { AzureStorage, KeyVaultConfig, KubernetesClusterConfig, KubernetesClusterConfigAzure, KubernetesClusterConfigNFS,
KubernetesClusterConfigNFS, NFSConfig, KubernetesStorageKind, keyVaultConfig, AzureStorage, KubernetesClusterConfig, KubernetesStorageKind, KubernetesTrialConfig, KubernetesTrialConfigTemplate, NFSConfig, StorageConfig
StorageConfig } from '../kubernetesConfig' } from '../kubernetesConfig';
// tslint:disable:completed-docs
export class FrameworkAttemptCompletionPolicy { export class FrameworkAttemptCompletionPolicy {
public readonly minFailedTaskCount: number; public readonly minFailedTaskCount: number;
public readonly minSucceededTaskCount: number; public readonly minSucceededTaskCount: number;
...@@ -36,13 +37,13 @@ export class FrameworkAttemptCompletionPolicy { ...@@ -36,13 +37,13 @@ export class FrameworkAttemptCompletionPolicy {
/** /**
* Trial job configuration for FrameworkController * Trial job configuration for FrameworkController
*/ */
export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfigTemplate{ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfigTemplate {
public readonly frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy; public readonly frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy;
public readonly name: string; public readonly name: string;
public readonly taskNum: number; public readonly taskNum: number;
constructor(taskNum: number, command : string, gpuNum : number, constructor(taskNum: number, command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string, cpuNum: number, memoryMB: number, image: string,
frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy) { frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy) {
super(command, gpuNum, cpuNum, memoryMB, image); super(command, gpuNum, cpuNum, memoryMB, image);
this.frameworkAttemptCompletionPolicy = frameworkAttemptCompletionPolicy; this.frameworkAttemptCompletionPolicy = frameworkAttemptCompletionPolicy;
this.name = name; this.name = name;
...@@ -50,7 +51,7 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi ...@@ -50,7 +51,7 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
} }
} }
export class FrameworkControllerTrialConfig extends KubernetesTrialConfig{ export class FrameworkControllerTrialConfig extends KubernetesTrialConfig {
public readonly taskRoles: FrameworkControllerTrialConfigTemplate[]; public readonly taskRoles: FrameworkControllerTrialConfigTemplate[];
public readonly codeDir: string; public readonly codeDir: string;
constructor(codeDir: string, taskRoles: FrameworkControllerTrialConfigTemplate[]) { constructor(codeDir: string, taskRoles: FrameworkControllerTrialConfigTemplate[]) {
...@@ -68,11 +69,12 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig { ...@@ -68,11 +69,12 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig {
} }
} }
// tslint:disable:function-name
export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfigNFS { export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfigNFS {
public readonly serviceAccountName: string; public readonly serviceAccountName: string;
constructor( constructor(
serviceAccountName: string, serviceAccountName: string,
apiVersion: string, apiVersion: string,
nfs: NFSConfig, nfs: NFSConfig,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind
) { ) {
...@@ -81,8 +83,9 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig ...@@ -81,8 +83,9 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig
} }
public static getInstance(jsonObject: object): FrameworkControllerClusterConfigNFS { public static getInstance(jsonObject: object): FrameworkControllerClusterConfigNFS {
let kubeflowClusterConfigObjectNFS = <FrameworkControllerClusterConfigNFS>jsonObject; const kubeflowClusterConfigObjectNFS: FrameworkControllerClusterConfigNFS = <FrameworkControllerClusterConfigNFS>jsonObject;
assert (kubeflowClusterConfigObjectNFS !== undefined) assert (kubeflowClusterConfigObjectNFS !== undefined);
return new FrameworkControllerClusterConfigNFS( return new FrameworkControllerClusterConfigNFS(
kubeflowClusterConfigObjectNFS.serviceAccountName, kubeflowClusterConfigObjectNFS.serviceAccountName,
kubeflowClusterConfigObjectNFS.apiVersion, kubeflowClusterConfigObjectNFS.apiVersion,
...@@ -94,20 +97,21 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig ...@@ -94,20 +97,21 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig
export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConfigAzure { export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConfigAzure {
public readonly serviceAccountName: string; public readonly serviceAccountName: string;
constructor( constructor(
serviceAccountName: string, serviceAccountName: string,
apiVersion: string, apiVersion: string,
keyVault: keyVaultConfig, keyVault: KeyVaultConfig,
azureStorage: AzureStorage, azureStorage: AzureStorage,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind
) { ) {
super(apiVersion, keyVault, azureStorage,storage); super(apiVersion, keyVault, azureStorage, storage);
this.serviceAccountName = serviceAccountName; this.serviceAccountName = serviceAccountName;
} }
public static getInstance(jsonObject: object): FrameworkControllerClusterConfigAzure { public static getInstance(jsonObject: object): FrameworkControllerClusterConfigAzure {
let kubeflowClusterConfigObjectAzure = <FrameworkControllerClusterConfigAzure>jsonObject; const kubeflowClusterConfigObjectAzure: FrameworkControllerClusterConfigAzure = <FrameworkControllerClusterConfigAzure>jsonObject;
return new FrameworkControllerClusterConfigAzure( return new FrameworkControllerClusterConfigAzure(
kubeflowClusterConfigObjectAzure.serviceAccountName, kubeflowClusterConfigObjectAzure.serviceAccountName,
kubeflowClusterConfigObjectAzure.apiVersion, kubeflowClusterConfigObjectAzure.apiVersion,
...@@ -121,11 +125,11 @@ export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConf ...@@ -121,11 +125,11 @@ export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConf
export class FrameworkControllerClusterConfigFactory { export class FrameworkControllerClusterConfigFactory {
public static generateFrameworkControllerClusterConfig(jsonObject: object): FrameworkControllerClusterConfig { public static generateFrameworkControllerClusterConfig(jsonObject: object): FrameworkControllerClusterConfig {
let storageConfig = <StorageConfig>jsonObject; const storageConfig: StorageConfig = <StorageConfig>jsonObject;
if(!storageConfig) { if (storageConfig === undefined) {
throw new Error("Invalid json object as a StorageConfig instance"); throw new Error('Invalid json object as a StorageConfig instance');
} }
if(storageConfig.storage && storageConfig.storage === 'azureStorage') { if (storageConfig.storage !== undefined && storageConfig.storage === 'azureStorage') {
return FrameworkControllerClusterConfigAzure.getInstance(jsonObject); return FrameworkControllerClusterConfigAzure.getInstance(jsonObject);
} else if (storageConfig.storage === undefined || storageConfig.storage === 'nfs') { } else if (storageConfig.storage === undefined || storageConfig.storage === 'nfs') {
return FrameworkControllerClusterConfigNFS.getInstance(jsonObject); return FrameworkControllerClusterConfigNFS.getInstance(jsonObject);
...@@ -134,6 +138,7 @@ export class FrameworkControllerClusterConfigFactory { ...@@ -134,6 +138,7 @@ export class FrameworkControllerClusterConfigFactory {
} }
} }
export type FrameworkControllerJobStatus = 'AttemptRunning' | 'Completed' | 'AttemptCreationPending' | 'AttemptCreationRequested' | 'AttemptPreparing' | 'AttemptCompleted'; export type FrameworkControllerJobStatus =
'AttemptRunning' | 'Completed' | 'AttemptCreationPending' | 'AttemptCreationRequested' | 'AttemptPreparing' | 'AttemptCompleted';
export type FrameworkControllerJobCompleteStatus = 'Succeeded' | 'Failed'; export type FrameworkControllerJobCompleteStatus = 'Succeeded' | 'Failed';
\ No newline at end of file
...@@ -19,66 +19,74 @@ ...@@ -19,66 +19,74 @@
'use strict'; 'use strict';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesCRDClient } from '../kubernetesApiClient'; import { KubernetesCRDClient } from '../kubernetesApiClient';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesJobInfoCollector } from '../kubernetesJobInfoCollector'; import { KubernetesJobInfoCollector } from '../kubernetesJobInfoCollector';
import { FrameworkControllerJobStatus, FrameworkControllerJobCompleteStatus } from './frameworkcontrollerConfig'; import { FrameworkControllerJobCompleteStatus, FrameworkControllerJobStatus } from './frameworkcontrollerConfig';
/** /**
* Collector frameworkcontroller jobs info from Kubernetes cluster, and update frameworkcontroller job status locally * Collector frameworkcontroller jobs info from Kubernetes cluster, and update frameworkcontroller job status locally
*/ */
export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollector{ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollector {
constructor(jobMap: Map<string, KubernetesTrialJobDetail>) { constructor(jobMap: Map<string, KubernetesTrialJobDetail>) {
super(jobMap); super(jobMap);
} }
protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined, protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> { kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> {
if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) { if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) {
return Promise.resolve(); return Promise.resolve();
} }
if(kubernetesCRDClient === undefined) { if (kubernetesCRDClient === undefined) {
return Promise.reject('kubernetesCRDClient is undefined'); return Promise.reject('kubernetesCRDClient is undefined');
} }
// tslint:disable-next-line:no-any
let kubernetesJobInfo: any; let kubernetesJobInfo: any;
try { try {
kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName); kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName);
} catch(error) { } catch (error) {
this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`); this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`);
//This is not treat as a error status //This is not treat as a error status
return Promise.resolve(); return Promise.resolve();
} }
if(kubernetesJobInfo.status && kubernetesJobInfo.status.state) { // tslint:disable: no-unsafe-any
if (kubernetesJobInfo.status && kubernetesJobInfo.status.state) {
const frameworkJobType: FrameworkControllerJobStatus = <FrameworkControllerJobStatus>kubernetesJobInfo.status.state; const frameworkJobType: FrameworkControllerJobStatus = <FrameworkControllerJobStatus>kubernetesJobInfo.status.state;
switch(frameworkJobType) { switch (frameworkJobType) {
case 'AttemptCreationPending' || 'AttemptCreationRequested' || 'AttemptPreparing': case 'AttemptCreationPending':
case 'AttemptCreationRequested':
case 'AttemptPreparing':
kubernetesTrialJob.status = 'WAITING'; kubernetesTrialJob.status = 'WAITING';
break; break;
case 'AttemptRunning': case 'AttemptRunning':
kubernetesTrialJob.status = 'RUNNING'; kubernetesTrialJob.status = 'RUNNING';
if(!kubernetesTrialJob.startTime) { if (kubernetesTrialJob.startTime === undefined) {
kubernetesTrialJob.startTime = Date.parse(<string>kubernetesJobInfo.status.startTime); kubernetesTrialJob.startTime = Date.parse(<string>kubernetesJobInfo.status.startTime);
} }
break; break;
case 'Completed': case 'Completed':
const completedJobType : FrameworkControllerJobCompleteStatus = <FrameworkControllerJobCompleteStatus>kubernetesJobInfo.status.attemptStatus.completionStatus.type.name; const completedJobType : FrameworkControllerJobCompleteStatus =
switch(completedJobType) { <FrameworkControllerJobCompleteStatus>kubernetesJobInfo.status.attemptStatus.completionStatus.type.name;
switch (completedJobType) {
case 'Succeeded': case 'Succeeded':
kubernetesTrialJob.status = 'SUCCEEDED'; kubernetesTrialJob.status = 'SUCCEEDED';
break; break;
case 'Failed': case 'Failed':
kubernetesTrialJob.status = 'FAILED'; kubernetesTrialJob.status = 'FAILED';
break; break;
default:
} }
kubernetesTrialJob.endTime = Date.parse(<string>kubernetesJobInfo.status.completionTime); kubernetesTrialJob.endTime = Date.parse(<string>kubernetesJobInfo.status.completionTime);
break; break;
default: default:
break;
} }
} }
return Promise.resolve(); return Promise.resolve();
} }
} // tslint:enable: no-unsafe-any
\ No newline at end of file }
...@@ -20,16 +20,16 @@ ...@@ -20,16 +20,16 @@
'use strict'; 'use strict';
import * as component from '../../../common/component'; import * as component from '../../../common/component';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer';
import { FrameworkControllerTrainingService } from './frameworkcontrollerTrainingService'; import { FrameworkControllerTrainingService } from './frameworkcontrollerTrainingService';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/** /**
* frameworkcontroller Training service Rest server, provides rest API to support frameworkcontroller job metrics update * frameworkcontroller Training service Rest server, provides rest API to support frameworkcontroller job metrics update
* *
*/ */
@component.Singleton @component.Singleton
export class FrameworkControllerJobRestServer extends KubernetesJobRestServer{ export class FrameworkControllerJobRestServer extends KubernetesJobRestServer {
constructor() { constructor() {
super(component.get(FrameworkControllerTrainingService)); super(component.get(FrameworkControllerTrainingService));
} }
} }
\ No newline at end of file
...@@ -17,31 +17,29 @@ ...@@ -17,31 +17,29 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/ */
'use strict' 'use strict';
import * as component from '../../../common/component';
import * as cpp from 'child-process-promise'; import * as cpp from 'child-process-promise';
import * as fs from 'fs'; import * as fs from 'fs';
import * as path from 'path'; import * as path from 'path';
import * as component from '../../../common/component';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
import { getExperimentId } from '../../../common/experimentStartupInfo'; import { getExperimentId } from '../../../common/experimentStartupInfo';
import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import { import {
JobApplicationForm, TrialJobApplicationForm, JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail
TrialJobDetail, NNIManagerIpConfig
} from '../../../common/trainingService'; } from '../../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils'; import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils';
import { NFSConfig } from '../kubernetesConfig' import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
import { KubernetesTrialJobDetail } from '../kubernetesData'; import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import { validateCodeDir } from '../../common/util'; import { validateCodeDir } from '../../common/util';
import { AzureStorageClientUtility } from '../azureStorageClientUtils'; import { AzureStorageClientUtility } from '../azureStorageClientUtils';
import { NFSConfig } from '../kubernetesConfig';
import { KubernetesTrialJobDetail } from '../kubernetesData';
import { KubernetesTrainingService } from '../kubernetesTrainingService'; import { KubernetesTrainingService } from '../kubernetesTrainingService';
import { FrameworkControllerTrialConfig, FrameworkControllerClusterConfig, FrameworkControllerClusterConfigAzure, FrameworkControllerClusterConfigNFS,
FrameworkControllerClusterConfigFactory} from './frameworkcontrollerConfig';
import { FrameworkControllerJobRestServer } from './frameworkcontrollerJobRestServer';
import { FrameworkControllerClient } from './frameworkcontrollerApiClient'; import { FrameworkControllerClient } from './frameworkcontrollerApiClient';
import { FrameworkControllerClusterConfig, FrameworkControllerClusterConfigAzure, FrameworkControllerClusterConfigFactory,
FrameworkControllerClusterConfigNFS, FrameworkControllerTrialConfig} from './frameworkcontrollerConfig';
import { FrameworkControllerJobInfoCollector } from './frameworkcontrollerJobInfoCollector'; import { FrameworkControllerJobInfoCollector } from './frameworkcontrollerJobInfoCollector';
import { FrameworkControllerJobRestServer } from './frameworkcontrollerJobRestServer';
/** /**
* Training Service implementation for frameworkcontroller * Training Service implementation for frameworkcontroller
...@@ -49,30 +47,30 @@ import { FrameworkControllerJobInfoCollector } from './frameworkcontrollerJobInf ...@@ -49,30 +47,30 @@ import { FrameworkControllerJobInfoCollector } from './frameworkcontrollerJobInf
@component.Singleton @component.Singleton
class FrameworkControllerTrainingService extends KubernetesTrainingService implements KubernetesTrainingService { class FrameworkControllerTrainingService extends KubernetesTrainingService implements KubernetesTrainingService {
private fcTrialConfig?: FrameworkControllerTrialConfig; // frameworkcontroller trial configuration private fcTrialConfig?: FrameworkControllerTrialConfig; // frameworkcontroller trial configuration
private fcJobInfoCollector: FrameworkControllerJobInfoCollector; // frameworkcontroller job info collector private readonly fcJobInfoCollector: FrameworkControllerJobInfoCollector; // frameworkcontroller job info collector
private fcContainerPortMap = new Map<string, number>(); // store frameworkcontroller container port private readonly fcContainerPortMap: Map<string, number> = new Map<string, number>(); // store frameworkcontroller container port
private fcClusterConfig?: FrameworkControllerClusterConfig; private fcClusterConfig?: FrameworkControllerClusterConfig;
constructor() { constructor() {
super(); super();
this.fcJobInfoCollector = new FrameworkControllerJobInfoCollector(this.trialJobsMap); this.fcJobInfoCollector = new FrameworkControllerJobInfoCollector(this.trialJobsMap);
this.experimentId = getExperimentId(); this.experimentId = getExperimentId();
this.nextTrialSequenceId = -1; this.nextTrialSequenceId = -1;
} }
public async run(): Promise<void> { public async run(): Promise<void> {
this.kubernetesJobRestServer = component.get(FrameworkControllerJobRestServer); this.kubernetesJobRestServer = component.get(FrameworkControllerJobRestServer);
if(!this.kubernetesJobRestServer) { if (this.kubernetesJobRestServer === undefined) {
throw new Error('kubernetesJobRestServer not initialized!'); throw new Error('kubernetesJobRestServer not initialized!');
} }
await this.kubernetesJobRestServer.start(); await this.kubernetesJobRestServer.start();
this.kubernetesJobRestServer.setEnableVersionCheck = this.versionCheck; this.kubernetesJobRestServer.setEnableVersionCheck = this.versionCheck;
this.log.info(`frameworkcontroller Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`); this.log.info(`frameworkcontroller Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`);
while (!this.stopping) { while (!this.stopping) {
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server // collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
await delay(3000); await delay(3000);
await this.fcJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient); await this.fcJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient);
if(this.kubernetesJobRestServer.getErrorMessage) { if (this.kubernetesJobRestServer.getErrorMessage !== undefined) {
throw new Error(this.kubernetesJobRestServer.getErrorMessage); throw new Error(this.kubernetesJobRestServer.getErrorMessage);
this.stopping = true; this.stopping = true;
} }
...@@ -80,14 +78,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -80,14 +78,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
} }
public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> { public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> {
if(!this.fcClusterConfig) { if (this.fcClusterConfig === undefined) {
throw new Error('frameworkcontrollerClusterConfig is not initialized'); throw new Error('frameworkcontrollerClusterConfig is not initialized');
} }
if(!this.kubernetesCRDClient) { if (this.kubernetesCRDClient === undefined) {
throw new Error('kubernetesCRDClient is undefined'); throw new Error('kubernetesCRDClient is undefined');
} }
if(!this.kubernetesRestServerPort) { if (this.kubernetesRestServerPort === undefined) {
const restServer: FrameworkControllerJobRestServer = component.get(FrameworkControllerJobRestServer); const restServer: FrameworkControllerJobRestServer = component.get(FrameworkControllerJobRestServer);
this.kubernetesRestServerPort = restServer.clusterRestServerPort; this.kubernetesRestServerPort = restServer.clusterRestServerPort;
} }
...@@ -97,14 +95,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -97,14 +95,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
// Set trial's NFS working folder // Set trial's NFS working folder
const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId); const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId);
const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId); const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
const frameworkcontrollerJobName = `nniexp${this.experimentId}trial${trialJobId}`.toLowerCase(); const frameworkcontrollerJobName: string = `nniexp${this.experimentId}trial${trialJobId}`.toLowerCase();
//Generate the port used for taskRole //Generate the port used for taskRole
this.generateContainerPort(); this.generateContainerPort();
await this.prepareRunScript(trialLocalTempFolder, curTrialSequenceId, trialJobId, trialWorkingFolder, form); await this.prepareRunScript(trialLocalTempFolder, curTrialSequenceId, trialJobId, trialWorkingFolder, form);
//upload code files //upload code files
let trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder);
const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail( const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail(
trialJobId, trialJobId,
'WAITING', 'WAITING',
...@@ -116,182 +114,202 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -116,182 +114,202 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
trialJobOutputUrl trialJobOutputUrl
); );
// Set trial job detail until create frameworkcontroller job successfully // Set trial job detail until create frameworkcontroller job successfully
this.trialJobsMap.set(trialJobId, trialJobDetail); this.trialJobsMap.set(trialJobId, trialJobDetail);
// Create frameworkcontroller job based on generated frameworkcontroller job resource config // Create frameworkcontroller job based on generated frameworkcontroller job resource config
const frameworkcontrollerJobConfig = await this.prepareFrameworkControllerConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName); // tslint:disable-next-line:no-any
const frameworkcontrollerJobConfig: any = await this.prepareFrameworkControllerConfig(
trialJobId, trialWorkingFolder, frameworkcontrollerJobName);
await this.kubernetesCRDClient.createKubernetesJob(frameworkcontrollerJobConfig); await this.kubernetesCRDClient.createKubernetesJob(frameworkcontrollerJobConfig);
// Set trial job detail until create frameworkcontroller job successfully // Set trial job detail until create frameworkcontroller job successfully
this.trialJobsMap.set(trialJobId, trialJobDetail); this.trialJobsMap.set(trialJobId, trialJobDetail);
return Promise.resolve(trialJobDetail); return Promise.resolve(trialJobDetail);
} }
// tslint:disable:no-redundant-jsdoc no-any no-unsafe-any
public async setClusterMetadata(key: string, value: string): Promise<void> {
switch (key) {
case TrialConfigMetadataKey.NNI_MANAGER_IP:
this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
break;
case TrialConfigMetadataKey.FRAMEWORKCONTROLLER_CLUSTER_CONFIG:
const frameworkcontrollerClusterJsonObject: any = JSON.parse(value);
this.fcClusterConfig = FrameworkControllerClusterConfigFactory
.generateFrameworkControllerClusterConfig(frameworkcontrollerClusterJsonObject);
if (this.fcClusterConfig.storageType === 'azureStorage') {
const azureFrameworkControllerClusterConfig: FrameworkControllerClusterConfigAzure =
<FrameworkControllerClusterConfigAzure>this.fcClusterConfig;
this.azureStorageAccountName = azureFrameworkControllerClusterConfig.azureStorage.accountName;
this.azureStorageShare = azureFrameworkControllerClusterConfig.azureStorage.azureShare;
await this.createAzureStorage(
azureFrameworkControllerClusterConfig.keyVault.vaultName,
azureFrameworkControllerClusterConfig.keyVault.name,
azureFrameworkControllerClusterConfig.azureStorage.accountName,
azureFrameworkControllerClusterConfig.azureStorage.azureShare
);
} else if (this.fcClusterConfig.storageType === 'nfs') {
const nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS =
<FrameworkControllerClusterConfigNFS>this.fcClusterConfig;
await this.createNFSStorage(
nfsFrameworkControllerClusterConfig.nfs.server,
nfsFrameworkControllerClusterConfig.nfs.path
);
}
this.kubernetesCRDClient = FrameworkControllerClient.generateFrameworkControllerClient();
break;
case TrialConfigMetadataKey.TRIAL_CONFIG:
const frameworkcontrollerTrialJsonObjsect: any = JSON.parse(value);
this.fcTrialConfig = new FrameworkControllerTrialConfig(
frameworkcontrollerTrialJsonObjsect.codeDir,
frameworkcontrollerTrialJsonObjsect.taskRoles
);
// Validate to make sure codeDir doesn't have too many files
try {
await validateCodeDir(this.fcTrialConfig.codeDir);
} catch (error) {
this.log.error(error);
return Promise.reject(new Error(error));
}
break;
case TrialConfigMetadataKey.VERSION_CHECK:
this.versionCheck = (value === 'true' || value === 'True');
break;
case TrialConfigMetadataKey.LOG_COLLECTION:
this.logCollection = value;
break;
default:
}
return Promise.resolve();
}
// tslint:enable: no-any no-unsafe-any
/** /**
* upload code files to nfs or azureStroage * upload code files to nfs or azureStroage
* @param trialJobId * @param trialJobId
* @param trialLocalTempFolder * @param trialLocalTempFolder
* return: trialJobOutputUrl * return: trialJobOutputUrl
*/ */
private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise<string> { private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise<string> {
if(!this.fcClusterConfig) { if (this.fcClusterConfig === undefined) {
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
} }
let trialJobOutputUrl: string = ''; let trialJobOutputUrl: string = '';
if(this.fcClusterConfig.storageType === 'azureStorage') { if (this.fcClusterConfig.storageType === 'azureStorage') {
try{ if (this.azureStorageClient === undefined) {
throw new Error('azureStorageClient is not initialized');
}
try {
//upload local files to azure storage //upload local files to azure storage
await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, await AzureStorageClientUtility.uploadDirectory(
`nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`); this.azureStorageClient, `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`);
trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}` trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/\
}catch(error){ ${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
} catch (error) {
this.log.error(error); this.log.error(error);
return Promise.reject(error); return Promise.reject(error);
} }
} else if(this.fcClusterConfig.storageType === 'nfs') { } else if (this.fcClusterConfig.storageType === 'nfs') {
let nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS = <FrameworkControllerClusterConfigNFS>this.fcClusterConfig; const nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS =
// Creat work dir for current trial in NFS directory <FrameworkControllerClusterConfigNFS>this.fcClusterConfig;
// Creat work dir for current trial in NFS directory
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`); await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`);
// Copy code files from local dir to NFS mounted dir // Copy code files from local dir to NFS mounted dir
await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`); await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
const nfsConfig: NFSConfig = nfsFrameworkControllerClusterConfig.nfs; const nfsConfig: NFSConfig = nfsFrameworkControllerClusterConfig.nfs;
trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}` trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`;
} }
return Promise.resolve(trialJobOutputUrl); return Promise.resolve(trialJobOutputUrl);
} }
/** /**
* generate trial's command for frameworkcontroller * generate trial's command for frameworkcontroller
* expose port and execute injector.sh before executing user's command * expose port and execute injector.sh before executing user's command
* @param command * @param command
*/ */
private generateCommandScript(command: string): string { private generateCommandScript(command: string): string {
let portScript = ''; let portScript: string = '';
if(!this.fcTrialConfig) { if (this.fcTrialConfig === undefined) {
throw new Error('frameworkcontroller trial config is not initialized'); throw new Error('frameworkcontroller trial config is not initialized');
} }
for(let taskRole of this.fcTrialConfig.taskRoles) { for (const taskRole of this.fcTrialConfig.taskRoles) {
portScript += `FB_${taskRole.name.toUpperCase()}_PORT=${this.fcContainerPortMap.get(taskRole.name)} `; portScript += `FB_${taskRole.name.toUpperCase()}_PORT=${this.fcContainerPortMap.get(taskRole.name)} `;
} }
return `${portScript} . /mnt/frameworkbarrier/injector.sh && ${command}`; return `${portScript} . /mnt/frameworkbarrier/injector.sh && ${command}`;
} }
private async prepareRunScript(trialLocalTempFolder: string, curTrialSequenceId: number, trialJobId: string, trialWorkingFolder: string, form: JobApplicationForm): Promise<void> { private async prepareRunScript(trialLocalTempFolder: string, curTrialSequenceId: number, trialJobId: string,
if(!this.fcTrialConfig) { trialWorkingFolder: string, form: JobApplicationForm): Promise<void> {
if (this.fcTrialConfig === undefined) {
throw new Error('frameworkcontroller trial config is not initialized'); throw new Error('frameworkcontroller trial config is not initialized');
} }
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`); await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${this.fcTrialConfig.codeDir} ${trialLocalTempFolder}`); await cpp.exec(`cp -r ${this.fcTrialConfig.codeDir} ${trialLocalTempFolder}`);
const runScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT; const installScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files // Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), installScriptContent, { encoding: 'utf8' });
// Create tmp trial working folder locally. // Create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`); await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);
for(let taskRole of this.fcTrialConfig.taskRoles) { for (const taskRole of this.fcTrialConfig.taskRoles) {
const runScriptContent: string = await this.generateRunScript('frameworkcontroller', trialJobId, trialWorkingFolder, const runScriptContent: string =
this.generateCommandScript(taskRole.command), curTrialSequenceId.toString(), taskRole.name, taskRole.gpuNum); await this.generateRunScript('frameworkcontroller', trialJobId, trialWorkingFolder,
this.generateCommandScript(taskRole.command), curTrialSequenceId.toString(),
taskRole.name, taskRole.gpuNum);
await fs.promises.writeFile(path.join(trialLocalTempFolder, `run_${taskRole.name}.sh`), runScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, `run_${taskRole.name}.sh`), runScriptContent, { encoding: 'utf8' });
} }
// Write file content ( parameter.cfg ) to local tmp folders // Write file content ( parameter.cfg ) to local tmp folders
const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form) const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form);
if(trialForm && trialForm.hyperParameters) { if (trialForm !== undefined && trialForm.hyperParameters !== undefined) {
await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)), await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)),
trialForm.hyperParameters.value, { encoding: 'utf8' }); trialForm.hyperParameters.value, { encoding: 'utf8' });
} }
} }
private async prepareFrameworkControllerConfig(trialJobId: string, trialWorkingFolder: string, frameworkcontrollerJobName: string): Promise<any> {
if(!this.fcTrialConfig) { // tslint:disable: no-any no-unsafe-any
private async prepareFrameworkControllerConfig(trialJobId: string, trialWorkingFolder: string, frameworkcontrollerJobName: string):
Promise<any> {
if (this.fcTrialConfig === undefined) {
throw new Error('frameworkcontroller trial config is not initialized'); throw new Error('frameworkcontroller trial config is not initialized');
} }
const podResources : any = []; const podResources : any = [];
for(let taskRole of this.fcTrialConfig.taskRoles) { for (const taskRole of this.fcTrialConfig.taskRoles) {
let resource: any = {}; const resource: any = {};
resource.requests = this.generatePodResource(taskRole.memoryMB, taskRole.cpuNum, taskRole.gpuNum); resource.requests = this.generatePodResource(taskRole.memoryMB, taskRole.cpuNum, taskRole.gpuNum);
resource.limits = Object.assign({}, resource.requests); resource.limits = {...resource.requests};
podResources.push(resource); podResources.push(resource);
} }
// Generate frameworkcontroller job resource config object // Generate frameworkcontroller job resource config object
const frameworkcontrollerJobConfig: any = this.generateFrameworkControllerJobConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName, podResources); const frameworkcontrollerJobConfig: any =
this.generateFrameworkControllerJobConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName, podResources);
return Promise.resolve(frameworkcontrollerJobConfig); return Promise.resolve(frameworkcontrollerJobConfig);
}
public async setClusterMetadata(key: string, value: string): Promise<void> {
switch (key) {
case TrialConfigMetadataKey.NNI_MANAGER_IP:
this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
break;
case TrialConfigMetadataKey.FRAMEWORKCONTROLLER_CLUSTER_CONFIG:
let frameworkcontrollerClusterJsonObject = JSON.parse(value);
this.fcClusterConfig = FrameworkControllerClusterConfigFactory.generateFrameworkControllerClusterConfig(frameworkcontrollerClusterJsonObject);
if(this.fcClusterConfig.storageType === 'azureStorage') {
let azureFrameworkControllerClusterConfig = <FrameworkControllerClusterConfigAzure>this.fcClusterConfig;
this.azureStorageAccountName = azureFrameworkControllerClusterConfig.azureStorage.accountName;
this.azureStorageShare = azureFrameworkControllerClusterConfig.azureStorage.azureShare;
await this.createAzureStorage(
azureFrameworkControllerClusterConfig.keyVault.vaultName,
azureFrameworkControllerClusterConfig.keyVault.name,
azureFrameworkControllerClusterConfig.azureStorage.accountName,
azureFrameworkControllerClusterConfig.azureStorage.azureShare
);
} else if(this.fcClusterConfig.storageType === 'nfs') {
let nfsFrameworkControllerClusterConfig = <FrameworkControllerClusterConfigNFS>this.fcClusterConfig;
await this.createNFSStorage(
nfsFrameworkControllerClusterConfig.nfs.server,
nfsFrameworkControllerClusterConfig.nfs.path
);
}
this.kubernetesCRDClient = FrameworkControllerClient.generateFrameworkControllerClient();
break;
case TrialConfigMetadataKey.TRIAL_CONFIG:
let frameworkcontrollerTrialJsonObjsect = JSON.parse(value);
this.fcTrialConfig = new FrameworkControllerTrialConfig(
frameworkcontrollerTrialJsonObjsect.codeDir,
frameworkcontrollerTrialJsonObjsect.taskRoles
);
// Validate to make sure codeDir doesn't have too many files
try {
await validateCodeDir(this.fcTrialConfig.codeDir);
} catch(error) {
this.log.error(error);
return Promise.reject(new Error(error));
}
break;
case TrialConfigMetadataKey.VERSION_CHECK:
this.versionCheck = (value === 'true' || value === 'True');
break;
case TrialConfigMetadataKey.LOG_COLLECTION:
this.logCollection = value;
break;
default:
break;
}
return Promise.resolve();
} }
private generateContainerPort() { private generateContainerPort(): void {
if(!this.fcTrialConfig) { if (this.fcTrialConfig === undefined) {
throw new Error('frameworkcontroller trial config is not initialized'); throw new Error('frameworkcontroller trial config is not initialized');
} }
let port = 4000; //The default port used in container let port: number = 4000; //The default port used in container
for(let index in this.fcTrialConfig.taskRoles) { for (const index of this.fcTrialConfig.taskRoles.keys()) {
this.fcContainerPortMap.set(this.fcTrialConfig.taskRoles[index].name, port); this.fcContainerPortMap.set(this.fcTrialConfig.taskRoles[index].name, port);
port += 1; port += 1;
} }
...@@ -304,24 +322,25 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -304,24 +322,25 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
* @param frameworkcontrollerJobName job name * @param frameworkcontrollerJobName job name
* @param podResources pod template * @param podResources pod template
*/ */
private generateFrameworkControllerJobConfig(trialJobId: string, trialWorkingFolder: string, frameworkcontrollerJobName : string, podResources : any) : any { private generateFrameworkControllerJobConfig(trialJobId: string, trialWorkingFolder: string,
if(!this.fcClusterConfig) { frameworkcontrollerJobName : string, podResources : any) : any {
if (this.fcClusterConfig === undefined) {
throw new Error('frameworkcontroller Cluster config is not initialized'); throw new Error('frameworkcontroller Cluster config is not initialized');
} }
if(!this.fcTrialConfig) { if (this.fcTrialConfig === undefined) {
throw new Error('frameworkcontroller trial config is not initialized'); throw new Error('frameworkcontroller trial config is not initialized');
} }
let taskRoles = []; const taskRoles: any = [];
for(let index in this.fcTrialConfig.taskRoles) { for (const index of this.fcTrialConfig.taskRoles.keys()) {
let containerPort = this.fcContainerPortMap.get(this.fcTrialConfig.taskRoles[index].name); const containerPort: number | undefined = this.fcContainerPortMap.get(this.fcTrialConfig.taskRoles[index].name);
if(!containerPort) { if (containerPort === undefined) {
throw new Error('Container port is not initialized'); throw new Error('Container port is not initialized');
} }
let taskRole = this.generateTaskRoleConfig( const taskRole: any = this.generateTaskRoleConfig(
trialWorkingFolder, trialWorkingFolder,
this.fcTrialConfig.taskRoles[index].image, this.fcTrialConfig.taskRoles[index].image,
`run_${this.fcTrialConfig.taskRoles[index].name}.sh`, `run_${this.fcTrialConfig.taskRoles[index].name}.sh`,
podResources[index], podResources[index],
containerPort containerPort
...@@ -330,17 +349,17 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -330,17 +349,17 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
name: this.fcTrialConfig.taskRoles[index].name, name: this.fcTrialConfig.taskRoles[index].name,
taskNumber: this.fcTrialConfig.taskRoles[index].taskNum, taskNumber: this.fcTrialConfig.taskRoles[index].taskNum,
frameworkAttemptCompletionPolicy: { frameworkAttemptCompletionPolicy: {
minFailedTaskCount: this.fcTrialConfig.taskRoles[index].frameworkAttemptCompletionPolicy.minFailedTaskCount, minFailedTaskCount: this.fcTrialConfig.taskRoles[index].frameworkAttemptCompletionPolicy.minFailedTaskCount,
minSucceededTaskCount: this.fcTrialConfig.taskRoles[index].frameworkAttemptCompletionPolicy.minSucceededTaskCount minSucceededTaskCount: this.fcTrialConfig.taskRoles[index].frameworkAttemptCompletionPolicy.minSucceededTaskCount
}, },
task: taskRole task: taskRole
}); });
} }
return { return {
apiVersion: `frameworkcontroller.microsoft.com/v1`, apiVersion: `frameworkcontroller.microsoft.com/v1`,
kind: 'Framework', kind: 'Framework',
metadata: { metadata: {
name: frameworkcontrollerJobName, name: frameworkcontrollerJobName,
namespace: 'default', namespace: 'default',
labels: { labels: {
...@@ -356,19 +375,18 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -356,19 +375,18 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}; };
} }
private generateTaskRoleConfig(trialWorkingFolder: string, replicaImage: string, runScriptFile: string,
podResources: any, containerPort: number): any {
private generateTaskRoleConfig(trialWorkingFolder: string, replicaImage: string, runScriptFile: string, podResources: any, containerPort: number): any { if (this.fcClusterConfig === undefined) {
if(!this.fcClusterConfig) {
throw new Error('frameworkcontroller Cluster config is not initialized'); throw new Error('frameworkcontroller Cluster config is not initialized');
} }
if(!this.fcTrialConfig) { if (this.fcTrialConfig === undefined) {
throw new Error('frameworkcontroller trial config is not initialized'); throw new Error('frameworkcontroller trial config is not initialized');
} }
let volumeSpecMap = new Map<string, object>(); const volumeSpecMap: Map<string, object> = new Map<string, object>();
if(this.fcClusterConfig.storageType === 'azureStorage'){ if (this.fcClusterConfig.storageType === 'azureStorage') {
volumeSpecMap.set('nniVolumes', [ volumeSpecMap.set('nniVolumes', [
{ {
name: 'nni-vol', name: 'nni-vol',
...@@ -380,9 +398,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -380,9 +398,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}, { }, {
name: 'frameworkbarrier-volume', name: 'frameworkbarrier-volume',
emptyDir: {} emptyDir: {}
}]) }]);
}else { } else {
let frameworkcontrollerClusterConfigNFS: FrameworkControllerClusterConfigNFS = <FrameworkControllerClusterConfigNFS> this.fcClusterConfig; const frameworkcontrollerClusterConfigNFS: FrameworkControllerClusterConfigNFS =
<FrameworkControllerClusterConfigNFS> this.fcClusterConfig;
volumeSpecMap.set('nniVolumes', [ volumeSpecMap.set('nniVolumes', [
{ {
name: 'nni-vol', name: 'nni-vol',
...@@ -393,19 +412,19 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -393,19 +412,19 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}, { }, {
name: 'frameworkbarrier-volume', name: 'frameworkbarrier-volume',
emptyDir: {} emptyDir: {}
}]) }]);
} }
let containers = [ const containers: any = [
{ {
name: 'framework', name: 'framework',
image: replicaImage, image: replicaImage,
command: ["sh", `${path.join(trialWorkingFolder, runScriptFile)}`], command: ['sh', `${path.join(trialWorkingFolder, runScriptFile)}`],
volumeMounts: [ volumeMounts: [
{ {
name: 'nni-vol', name: 'nni-vol',
mountPath: this.CONTAINER_MOUNT_PATH mountPath: this.CONTAINER_MOUNT_PATH
},{ }, {
name: 'frameworkbarrier-volume', name: 'frameworkbarrier-volume',
mountPath: '/mnt/frameworkbarrier' mountPath: '/mnt/frameworkbarrier'
}], }],
...@@ -413,35 +432,36 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -413,35 +432,36 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
ports: [{ ports: [{
containerPort: containerPort containerPort: containerPort
}] }]
}] }];
let initContainers = [ const initContainers: any = [
{ {
name: 'frameworkbarrier', name: 'frameworkbarrier',
image: 'frameworkcontroller/frameworkbarrier', image: 'frameworkcontroller/frameworkbarrier',
volumeMounts: [ volumeMounts: [
{ {
name: 'frameworkbarrier-volume', name: 'frameworkbarrier-volume',
mountPath: '/mnt/frameworkbarrier' mountPath: '/mnt/frameworkbarrier'
}] }]
}] }];
let spec: any = { const spec: any = {
containers: containers, containers: containers,
initContainers: initContainers, initContainers: initContainers,
restartPolicy: 'OnFailure', restartPolicy: 'OnFailure',
volumes: volumeSpecMap.get('nniVolumes'), volumes: volumeSpecMap.get('nniVolumes'),
hostNetwork: false hostNetwork: false
}; };
if(this.fcClusterConfig.serviceAccountName) { if (this.fcClusterConfig.serviceAccountName !== undefined) {
spec.serviceAccountName = this.fcClusterConfig.serviceAccountName; spec.serviceAccountName = this.fcClusterConfig.serviceAccountName;
} }
let taskRole = {
return {
pod: { pod: {
spec: spec spec: spec
} }
} };
return taskRole;
} }
// tslint:enable: no-any no-unsafe-any
} }
export { FrameworkControllerTrainingService } export { FrameworkControllerTrainingService };
...@@ -20,18 +20,22 @@ ...@@ -20,18 +20,22 @@
'use strict'; 'use strict';
import * as fs from 'fs'; import * as fs from 'fs';
import { GeneralK8sClient, KubernetesCRDClient } from '../kubernetesApiClient';
import { KubeflowOperator } from './kubeflowConfig'; import { KubeflowOperator } from './kubeflowConfig';
import { KubernetesCRDClient, GeneralK8sClient } from '../kubernetesApiClient';
abstract class KubeflowOperatorClient extends KubernetesCRDClient{ /**
* KubeflowOperator Client
*/
abstract class KubeflowOperatorClient extends KubernetesCRDClient {
/** /**
* Factory method to generate operator cliet * Factory method to generate operator client
*/ */
public static generateOperatorClient(kubeflowOperator: KubeflowOperator, // tslint:disable-next-line:function-name
operatorApiVersion: string): KubernetesCRDClient { public static generateOperatorClient(kubeflowOperator: KubeflowOperator,
switch(kubeflowOperator) { operatorApiVersion: string): KubernetesCRDClient {
switch (kubeflowOperator) {
case 'tf-operator': { case 'tf-operator': {
switch(operatorApiVersion) { switch (operatorApiVersion) {
case 'v1alpha2': { case 'v1alpha2': {
return new TFOperatorClientV1Alpha2(); return new TFOperatorClientV1Alpha2();
} }
...@@ -41,11 +45,12 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{ ...@@ -41,11 +45,12 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
case 'v1beta2': { case 'v1beta2': {
return new TFOperatorClientV1Beta2(); return new TFOperatorClientV1Beta2();
} }
default:
throw new Error(`Invalid tf-operator apiVersion ${operatorApiVersion}`);
} }
break;
} }
case 'pytorch-operator': { case 'pytorch-operator': {
switch(operatorApiVersion) { switch (operatorApiVersion) {
case 'v1alpha2': { case 'v1alpha2': {
return new PyTorchOperatorClientV1Alpha2(); return new PyTorchOperatorClientV1Alpha2();
} }
...@@ -55,13 +60,17 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{ ...@@ -55,13 +60,17 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
case 'v1beta2': { case 'v1beta2': {
return new PyTorchOperatorClientV1Beta2(); return new PyTorchOperatorClientV1Beta2();
} }
default:
throw new Error(`Invalid pytorch-operator apiVersion ${operatorApiVersion}`);
} }
} }
default:
throw new Error(`Invalid operator ${kubeflowOperator}`);
} }
throw new Error(`Invalid operator ${kubeflowOperator} or apiVersion ${operatorApiVersion}`);
} }
} }
// tslint:disable: no-unsafe-any no-any completed-docs
class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient { class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
/** /**
* constructor, to initialize tfjob CRD definition * constructor, to initialize tfjob CRD definition
...@@ -73,12 +82,12 @@ class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient { ...@@ -73,12 +82,12 @@ class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
} }
protected get operator(): any { protected get operator(): any {
return this.client.apis["kubeflow.org"].v1alpha2.namespaces('default').tfjobs; return this.client.apis['kubeflow.org'].v1alpha2.namespaces('default').tfjobs;
} }
public get containerName(): string { public get containerName(): string {
return 'tensorflow'; return 'tensorflow';
} }
} }
class TFOperatorClientV1Beta1 extends KubernetesCRDClient { class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
...@@ -92,12 +101,12 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient { ...@@ -92,12 +101,12 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
} }
protected get operator(): any { protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta1.namespaces('default').tfjobs; return this.client.apis['kubeflow.org'].v1beta1.namespaces('default').tfjobs;
} }
public get containerName(): string { public get containerName(): string {
return 'tensorflow'; return 'tensorflow';
} }
} }
class TFOperatorClientV1Beta2 extends KubernetesCRDClient { class TFOperatorClientV1Beta2 extends KubernetesCRDClient {
...@@ -111,12 +120,12 @@ class TFOperatorClientV1Beta2 extends KubernetesCRDClient { ...@@ -111,12 +120,12 @@ class TFOperatorClientV1Beta2 extends KubernetesCRDClient {
} }
protected get operator(): any { protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta2.namespaces('default').tfjobs; return this.client.apis['kubeflow.org'].v1beta2.namespaces('default').tfjobs;
} }
public get containerName(): string { public get containerName(): string {
return 'tensorflow'; return 'tensorflow';
} }
} }
class PyTorchOperatorClientV1Alpha2 extends KubeflowOperatorClient { class PyTorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
...@@ -130,7 +139,7 @@ class PyTorchOperatorClientV1Alpha2 extends KubeflowOperatorClient { ...@@ -130,7 +139,7 @@ class PyTorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
} }
protected get operator(): any { protected get operator(): any {
return this.client.apis["kubeflow.org"].v1alpha2.namespaces('default').pytorchjobs; return this.client.apis['kubeflow.org'].v1alpha2.namespaces('default').pytorchjobs;
} }
public get containerName(): string { public get containerName(): string {
...@@ -149,7 +158,7 @@ class PyTorchOperatorClientV1Beta1 extends KubernetesCRDClient { ...@@ -149,7 +158,7 @@ class PyTorchOperatorClientV1Beta1 extends KubernetesCRDClient {
} }
protected get operator(): any { protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta1.namespaces('default').pytorchjobs; return this.client.apis['kubeflow.org'].v1beta1.namespaces('default').pytorchjobs;
} }
public get containerName(): string { public get containerName(): string {
...@@ -168,7 +177,7 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient { ...@@ -168,7 +177,7 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
} }
protected get operator(): any { protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta2.namespaces('default').pytorchjobs; return this.client.apis['kubeflow.org'].v1beta2.namespaces('default').pytorchjobs;
} }
public get containerName(): string { public get containerName(): string {
...@@ -176,5 +185,5 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient { ...@@ -176,5 +185,5 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
} }
} }
// tslint:enable: no-unsafe-any
export { KubeflowOperatorClient, GeneralK8sClient }; export { KubeflowOperatorClient, GeneralK8sClient };
...@@ -20,16 +20,20 @@ ...@@ -20,16 +20,20 @@
'use strict'; 'use strict';
import * as assert from 'assert'; import * as assert from 'assert';
import { KubernetesClusterConfigAzure, KubernetesClusterConfigNFS, KubernetesStorageKind, NFSConfig, AzureStorage, keyVaultConfig,
KubernetesTrialConfig, KubernetesTrialConfigTemplate, StorageConfig, KubernetesClusterConfig } from '../kubernetesConfig'
import { MethodNotImplementedError } from '../../../common/errors'; import { MethodNotImplementedError } from '../../../common/errors';
import { AzureStorage, KeyVaultConfig, KubernetesClusterConfig, KubernetesClusterConfigAzure, KubernetesClusterConfigNFS,
KubernetesStorageKind, KubernetesTrialConfig, KubernetesTrialConfigTemplate, NFSConfig, StorageConfig
} from '../kubernetesConfig';
/** operator types that kubeflow supported */ // operator types that kubeflow supported
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' ; export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' ;
export type DistTrainRole = 'worker' | 'ps' | 'master'; export type DistTrainRole = 'worker' | 'ps' | 'master';
export type KubeflowJobStatus = 'Created' | 'Running' | 'Failed' | 'Succeeded'; export type KubeflowJobStatus = 'Created' | 'Running' | 'Failed' | 'Succeeded';
export type OperatorApiVersion = 'v1alpha2' | 'v1beta1' | 'v1beta2'; export type OperatorApiVersion = 'v1alpha2' | 'v1beta1' | 'v1beta2';
/**
* Kubeflow Cluster Configuration
*/
export class KubeflowClusterConfig extends KubernetesClusterConfig { export class KubeflowClusterConfig extends KubernetesClusterConfig {
public readonly operator: KubeflowOperator; public readonly operator: KubeflowOperator;
constructor(apiVersion: string, operator: KubeflowOperator) { constructor(apiVersion: string, operator: KubeflowOperator) {
...@@ -38,11 +42,12 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig { ...@@ -38,11 +42,12 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig {
} }
} }
// tslint:disable:completed-docs
export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS { export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
public readonly operator: KubeflowOperator; public readonly operator: KubeflowOperator;
constructor( constructor(
operator: KubeflowOperator, operator: KubeflowOperator,
apiVersion: string, apiVersion: string,
nfs: NFSConfig, nfs: NFSConfig,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind
) { ) {
...@@ -54,9 +59,11 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS { ...@@ -54,9 +59,11 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
return 'nfs'; return 'nfs';
} }
// tslint:disable-next-line:function-name
public static getInstance(jsonObject: object): KubeflowClusterConfigNFS { public static getInstance(jsonObject: object): KubeflowClusterConfigNFS {
let kubeflowClusterConfigObjectNFS = <KubeflowClusterConfigNFS>jsonObject; const kubeflowClusterConfigObjectNFS: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>jsonObject;
assert (kubeflowClusterConfigObjectNFS !== undefined) assert (kubeflowClusterConfigObjectNFS !== undefined);
return new KubeflowClusterConfigNFS( return new KubeflowClusterConfigNFS(
kubeflowClusterConfigObjectNFS.operator, kubeflowClusterConfigObjectNFS.operator,
kubeflowClusterConfigObjectNFS.apiVersion, kubeflowClusterConfigObjectNFS.apiVersion,
...@@ -66,26 +73,28 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS { ...@@ -66,26 +73,28 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
} }
} }
export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure{ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure {
public readonly operator: KubeflowOperator; public readonly operator: KubeflowOperator;
constructor( constructor(
operator: KubeflowOperator, operator: KubeflowOperator,
apiVersion: string, apiVersion: string,
keyVault: keyVaultConfig, keyVault: KeyVaultConfig,
azureStorage: AzureStorage, azureStorage: AzureStorage,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind
) { ) {
super(apiVersion, keyVault, azureStorage,storage); super(apiVersion, keyVault, azureStorage, storage);
this.operator = operator; this.operator = operator;
} }
public get storageType(): KubernetesStorageKind{ public get storageType(): KubernetesStorageKind {
return 'azureStorage'; return 'azureStorage';
} }
// tslint:disable-next-line:function-name
public static getInstance(jsonObject: object): KubeflowClusterConfigAzure { public static getInstance(jsonObject: object): KubeflowClusterConfigAzure {
let kubeflowClusterConfigObjectAzure = <KubeflowClusterConfigAzure>jsonObject; const kubeflowClusterConfigObjectAzure: KubeflowClusterConfigAzure = <KubeflowClusterConfigAzure>jsonObject;
return new KubeflowClusterConfigAzure( return new KubeflowClusterConfigAzure(
kubeflowClusterConfigObjectAzure.operator, kubeflowClusterConfigObjectAzure.operator,
kubeflowClusterConfigObjectAzure.apiVersion, kubeflowClusterConfigObjectAzure.apiVersion,
...@@ -98,12 +107,13 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure{ ...@@ -98,12 +107,13 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure{
export class KubeflowClusterConfigFactory { export class KubeflowClusterConfigFactory {
// tslint:disable-next-line:function-name
public static generateKubeflowClusterConfig(jsonObject: object): KubeflowClusterConfig { public static generateKubeflowClusterConfig(jsonObject: object): KubeflowClusterConfig {
let storageConfig = <StorageConfig>jsonObject; const storageConfig: StorageConfig = <StorageConfig>jsonObject;
if(!storageConfig) { if (storageConfig === undefined) {
throw new Error("Invalid json object as a StorageConfig instance"); throw new Error('Invalid json object as a StorageConfig instance');
} }
if(storageConfig.storage && storageConfig.storage === 'azureStorage') { if (storageConfig.storage !== undefined && storageConfig.storage === 'azureStorage') {
return KubeflowClusterConfigAzure.getInstance(jsonObject); return KubeflowClusterConfigAzure.getInstance(jsonObject);
} else if (storageConfig.storage === undefined || storageConfig.storage === 'nfs') { } else if (storageConfig.storage === undefined || storageConfig.storage === 'nfs') {
return KubeflowClusterConfigNFS.getInstance(jsonObject); return KubeflowClusterConfigNFS.getInstance(jsonObject);
...@@ -122,10 +132,10 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig { ...@@ -122,10 +132,10 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
} }
} }
export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate{ export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate {
public readonly replicas: number; public readonly replicas: number;
constructor(replicas: number, command : string, gpuNum : number, constructor(replicas: number, command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string) { cpuNum: number, memoryMB: number, image: string) {
super(command, gpuNum, cpuNum, memoryMB, image); super(command, gpuNum, cpuNum, memoryMB, image);
this.replicas = replicas; this.replicas = replicas;
} }
...@@ -163,22 +173,25 @@ export class KubeflowTrialConfigPytorch extends KubeflowTrialConfig { ...@@ -163,22 +173,25 @@ export class KubeflowTrialConfigPytorch extends KubeflowTrialConfig {
export class KubeflowTrialConfigFactory { export class KubeflowTrialConfigFactory {
// tslint:disable-next-line:function-name
public static generateKubeflowTrialConfig(jsonObject: object, operator: KubeflowOperator): KubeflowTrialConfig { public static generateKubeflowTrialConfig(jsonObject: object, operator: KubeflowOperator): KubeflowTrialConfig {
if(operator === 'tf-operator'){ if (operator === 'tf-operator') {
let kubeflowTrialConfigObject = <KubeflowTrialConfigTensorflow>jsonObject; const kubeflowTrialConfigObject: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>jsonObject;
return new KubeflowTrialConfigTensorflow( return new KubeflowTrialConfigTensorflow(
kubeflowTrialConfigObject.codeDir, kubeflowTrialConfigObject.codeDir,
kubeflowTrialConfigObject.worker, kubeflowTrialConfigObject.worker,
kubeflowTrialConfigObject.ps kubeflowTrialConfigObject.ps
); );
}else if(operator === 'pytorch-operator'){ } else if (operator === 'pytorch-operator') {
let kubeflowTrialConfigObject = <KubeflowTrialConfigPytorch>jsonObject; const kubeflowTrialConfigObject: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>jsonObject;
return new KubeflowTrialConfigPytorch( return new KubeflowTrialConfigPytorch(
kubeflowTrialConfigObject.codeDir, kubeflowTrialConfigObject.codeDir,
kubeflowTrialConfigObject.master, kubeflowTrialConfigObject.master,
kubeflowTrialConfigObject.worker kubeflowTrialConfigObject.worker
); );
} }
throw new Error(`Invalid json object ${jsonObject}`); throw new Error(`Invalid json object ${jsonObject}`);
} }
} }
...@@ -19,65 +19,68 @@ ...@@ -19,65 +19,68 @@
'use strict'; 'use strict';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesCRDClient } from '../kubernetesApiClient'; import { KubernetesCRDClient } from '../kubernetesApiClient';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesJobInfoCollector } from '../kubernetesJobInfoCollector'; import { KubernetesJobInfoCollector } from '../kubernetesJobInfoCollector';
import { KubeflowJobStatus } from './kubeflowConfig'; import { KubeflowJobStatus } from './kubeflowConfig';
/** /**
* Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally * Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally
*/ */
export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
constructor(jobMap: Map<string, KubernetesTrialJobDetail>) { constructor(jobMap: Map<string, KubernetesTrialJobDetail>) {
super(jobMap); super(jobMap);
} }
/**
 * Refresh one trial job's status and timestamps from its Kubeflow job resource.
 *
 * Trial jobs whose status is not in `statusesNeedToCheck` are left untouched. When the job
 * resource reports conditions, the type of the last one decides the new trial status:
 * Created -> WAITING, Running -> RUNNING, Failed -> FAILED, Succeeded -> SUCCEEDED.
 * Any other type leaves the trial job unchanged.
 *
 * @param kubernetesCRDClient client used to fetch the job resource; the promise is rejected when it is undefined
 * @param kubernetesTrialJob trial job detail, updated in place
 */
protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
                                           kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> {
    if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) {
        return;
    }

    if (kubernetesCRDClient === undefined) {
        return Promise.reject('kubernetesCRDClient is undefined');
    }

    // tslint:disable:no-any no-unsafe-any
    let kubernetesJobInfo: any;
    try {
        kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName);
    } catch (error) {
        // Not necessarily a real error: cancelling a trial job can also make getKubernetesJob fail,
        // so the failure is logged and the trial job is left as it is.
        this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`);

        return;
    }

    const conditions: any = kubernetesJobInfo.status && kubernetesJobInfo.status.conditions;
    // An empty (or non-array) condition list has no "latest" entry: indexing it would give
    // undefined and reading `.type` from that would throw, so there is nothing to update yet.
    if (!Array.isArray(conditions) || conditions.length === 0) {
        return;
    }

    const latestCondition: any = conditions[conditions.length - 1];
    const tfJobType : KubeflowJobStatus = <KubeflowJobStatus>latestCondition.type;
    switch (tfJobType) {
        case 'Created':
            kubernetesTrialJob.status = 'WAITING';
            kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
            break;
        case 'Running':
            kubernetesTrialJob.status = 'RUNNING';
            // Keep the start time recorded earlier, if there is one.
            if (kubernetesTrialJob.startTime === undefined) {
                kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
            }
            break;
        case 'Failed':
            kubernetesTrialJob.status = 'FAILED';
            kubernetesTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
            break;
        case 'Succeeded':
            kubernetesTrialJob.status = 'SUCCEEDED';
            kubernetesTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
            break;
        default:
    }
    // tslint:enable:no-any no-unsafe-any
}
} }
\ No newline at end of file
...@@ -20,19 +20,19 @@ ...@@ -20,19 +20,19 @@
'use strict'; 'use strict';
import * as component from '../../../common/component'; import * as component from '../../../common/component';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer';
import { KubeflowTrainingService } from './kubeflowTrainingService'; import { KubeflowTrainingService } from './kubeflowTrainingService';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/**
 * REST server of the Kubeflow training service.
 * Provides the REST API that supports metrics updates of Kubeflow jobs.
 */
@component.Singleton
export class KubeflowJobRestServer extends KubernetesJobRestServer {
    /**
     * Passes the KubeflowTrainingService obtained through `component.get` to the base REST server.
     */
    constructor() {
        super(component.get(KubeflowTrainingService));
    }
}
\ No newline at end of file
...@@ -17,35 +17,34 @@ ...@@ -17,35 +17,34 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/ */
'use strict' 'use strict';
import * as assert from 'assert'; import * as assert from 'assert';
import * as component from '../../../common/component';
import * as cpp from 'child-process-promise'; import * as cpp from 'child-process-promise';
import * as fs from 'fs'; import * as fs from 'fs';
import * as path from 'path'; import * as path from 'path';
import * as component from '../../../common/component';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
import { getExperimentId } from '../../../common/experimentStartupInfo'; import { getExperimentId } from '../../../common/experimentStartupInfo';
import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import { import {
JobApplicationForm, TrialJobApplicationForm, JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail
TrialJobDetail, NNIManagerIpConfig
} from '../../../common/trainingService'; } from '../../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils'; import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils';
import { KubeflowClusterConfigNFS, KubeflowClusterConfigAzure, import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
KubeflowTrialConfigPytorch, KubeflowTrialConfigTensorflow, KubeflowClusterConfigFactory, KubeflowTrialConfigFactory, import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
KubeflowTrialConfig, KubeflowClusterConfig } from './kubeflowConfig';
import { NFSConfig } from '../kubernetesConfig'
import { KubernetesTrialJobDetail } from '../kubernetesData';
import { KubeflowJobRestServer } from './kubeflowJobRestServer';
import { validateCodeDir } from '../../common/util'; import { validateCodeDir } from '../../common/util';
import { AzureStorageClientUtility } from '../azureStorageClientUtils'; import { AzureStorageClientUtility } from '../azureStorageClientUtils';
import { NFSConfig } from '../kubernetesConfig';
import { KubernetesTrialJobDetail } from '../kubernetesData';
import { KubernetesTrainingService } from '../kubernetesTrainingService';
import { KubeflowOperatorClient } from './kubeflowApiClient'; import { KubeflowOperatorClient } from './kubeflowApiClient';
import { KubernetesTrainingService } from '../kubernetesTrainingService' import { KubeflowClusterConfig, KubeflowClusterConfigAzure, KubeflowClusterConfigFactory, KubeflowClusterConfigNFS,
KubeflowTrialConfig, KubeflowTrialConfigFactory, KubeflowTrialConfigPytorch, KubeflowTrialConfigTensorflow
} from './kubeflowConfig';
import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector'; import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
import { KubeflowJobRestServer } from './kubeflowJobRestServer';
// tslint:disable: no-unsafe-any no-any
/** /**
* Training Service implementation for Kubeflow * Training Service implementation for Kubeflow
* Refer https://github.com/kubeflow/kubeflow for more info about Kubeflow * Refer https://github.com/kubeflow/kubeflow for more info about Kubeflow
...@@ -54,12 +53,12 @@ import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector'; ...@@ -54,12 +53,12 @@ import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
class KubeflowTrainingService extends KubernetesTrainingService implements KubernetesTrainingService { class KubeflowTrainingService extends KubernetesTrainingService implements KubernetesTrainingService {
private kubeflowClusterConfig?: KubeflowClusterConfig; private kubeflowClusterConfig?: KubeflowClusterConfig;
private kubeflowTrialConfig?: KubeflowTrialConfig; private kubeflowTrialConfig?: KubeflowTrialConfig;
private kubeflowJobInfoCollector: KubeflowJobInfoCollector; private readonly kubeflowJobInfoCollector: KubeflowJobInfoCollector;
/**
 * Sets up the experiment id, the trial sequence counter and the job info collector
 * that works on this service's trial job map.
 */
constructor() {
    super();
    this.kubeflowJobInfoCollector = new KubeflowJobInfoCollector(this.trialJobsMap);

    // Experiment-scoped state; -1 is the initial value of the sequence counter.
    this.experimentId = getExperimentId();
    this.nextTrialSequenceId = -1;

    this.log.info('Construct Kubeflow training service.');
}
...@@ -67,17 +66,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -67,17 +66,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
public async run(): Promise<void> { public async run(): Promise<void> {
this.log.info('Run Kubeflow training service.'); this.log.info('Run Kubeflow training service.');
this.kubernetesJobRestServer = component.get(KubeflowJobRestServer); this.kubernetesJobRestServer = component.get(KubeflowJobRestServer);
if(!this.kubernetesJobRestServer) { if (this.kubernetesJobRestServer === undefined) {
throw new Error('kubernetesJobRestServer not initialized!'); throw new Error('kubernetesJobRestServer not initialized!');
} }
await this.kubernetesJobRestServer.start(); await this.kubernetesJobRestServer.start();
this.kubernetesJobRestServer.setEnableVersionCheck = this.versionCheck; this.kubernetesJobRestServer.setEnableVersionCheck = this.versionCheck;
this.log.info(`Kubeflow Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`); this.log.info(`Kubeflow Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`);
while (!this.stopping) { while (!this.stopping) {
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server // collect metrics for Kubeflow jobs by interacting with Kubernetes API server
await delay(3000); await delay(3000);
await this.kubeflowJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient); await this.kubeflowJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient);
if(this.kubernetesJobRestServer.getErrorMessage) { if (this.kubernetesJobRestServer.getErrorMessage !== undefined) {
throw new Error(this.kubernetesJobRestServer.getErrorMessage); throw new Error(this.kubernetesJobRestServer.getErrorMessage);
this.stopping = true; this.stopping = true;
} }
...@@ -86,17 +85,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -86,17 +85,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} }
public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> { public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> {
if(!this.kubernetesCRDClient) { if (this.kubernetesCRDClient === undefined) {
throw new Error('Kubeflow job operator client is undefined'); throw new Error('Kubeflow job operator client is undefined');
} }
if(!this.kubernetesRestServerPort) { if (this.kubernetesRestServerPort === undefined) {
const restServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer); const restServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer);
this.kubernetesRestServerPort = restServer.clusterRestServerPort; this.kubernetesRestServerPort = restServer.clusterRestServerPort;
} }
const trialJobId: string = uniqueString(5); const trialJobId: string = uniqueString(5);
const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId); const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId);
const kubeflowJobName = `nni-exp-${this.experimentId}-trial-${trialJobId}`.toLowerCase(); const kubeflowJobName: string = `nni-exp-${this.experimentId}-trial-${trialJobId}`.toLowerCase();
const curTrialSequenceId: number = this.generateSequenceId(); const curTrialSequenceId: number = this.generateSequenceId();
const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId); const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
//prepare the runscript //prepare the runscript
...@@ -113,226 +112,239 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -113,226 +112,239 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
curTrialSequenceId, curTrialSequenceId,
trialJobOutputUrl trialJobOutputUrl
); );
// Generate kubeflow job resource config object // Generate kubeflow job resource config object
const kubeflowJobConfig: any = await this.prepareKubeflowConfig(trialJobId, trialWorkingFolder, kubeflowJobName); const kubeflowJobConfig: any = await this.prepareKubeflowConfig(trialJobId, trialWorkingFolder, kubeflowJobName);
// Create kubeflow job based on generated kubeflow job resource config // Create kubeflow job based on generated kubeflow job resource config
await this.kubernetesCRDClient.createKubernetesJob(kubeflowJobConfig); await this.kubernetesCRDClient.createKubernetesJob(kubeflowJobConfig);
// Set trial job detail until create Kubeflow job successfully // Set trial job detail until create Kubeflow job successfully
this.trialJobsMap.set(trialJobId, trialJobDetail); this.trialJobsMap.set(trialJobId, trialJobDetail);
return Promise.resolve(trialJobDetail); return Promise.resolve(trialJobDetail);
} }
// tslint:disable:no-redundant-jsdoc
/**
 * Apply one piece of cluster metadata to this training service.
 *
 * Handled keys:
 * - NNI_MANAGER_IP: parses the value as the NNI manager IP config.
 * - KUBEFLOW_CLUSTER_CONFIG: builds the cluster config, prepares Azure or NFS storage
 *   according to its storage type, then creates the operator client.
 * - TRIAL_CONFIG: builds the trial config (the cluster config must already be set) and validates codeDir.
 * - VERSION_CHECK: enabled when the value is 'true' or 'True'.
 * - LOG_COLLECTION: stored as given.
 * Any other key is ignored.
 *
 * @param key metadata key
 * @param value metadata value; JSON text for the config keys
 */
public async setClusterMetadata(key: string, value: string): Promise<void> {
    switch (key) {
        case TrialConfigMetadataKey.NNI_MANAGER_IP:
            this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
            break;

        case TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG: {
            const kubeflowClusterJsonObject: object = JSON.parse(value);
            this.kubeflowClusterConfig = KubeflowClusterConfigFactory.generateKubeflowClusterConfig(kubeflowClusterJsonObject);
            if (this.kubeflowClusterConfig.storageType === 'azureStorage') {
                const azureKubeflowClusterConfig: KubeflowClusterConfigAzure = <KubeflowClusterConfigAzure>this.kubeflowClusterConfig;
                this.azureStorageAccountName = azureKubeflowClusterConfig.azureStorage.accountName;
                this.azureStorageShare = azureKubeflowClusterConfig.azureStorage.azureShare;
                await this.createAzureStorage(
                    azureKubeflowClusterConfig.keyVault.vaultName,
                    azureKubeflowClusterConfig.keyVault.name,
                    azureKubeflowClusterConfig.azureStorage.accountName,
                    azureKubeflowClusterConfig.azureStorage.azureShare
                );
            } else if (this.kubeflowClusterConfig.storageType === 'nfs') {
                const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
                await this.createNFSStorage(
                    nfsKubeflowClusterConfig.nfs.server,
                    nfsKubeflowClusterConfig.nfs.path
                );
            }
            this.kubernetesCRDClient = KubeflowOperatorClient.generateOperatorClient(this.kubeflowClusterConfig.operator,
                                                                                     this.kubeflowClusterConfig.apiVersion);
            break;
        }

        case TrialConfigMetadataKey.TRIAL_CONFIG: {
            // The trial config factory needs the operator type from the cluster config.
            if (this.kubeflowClusterConfig === undefined) {
                this.log.error('kubeflow cluster config is not initialized');

                return Promise.reject(new Error('kubeflow cluster config is not initialized'));
            }

            const kubeflowTrialJsonObjsect: object = JSON.parse(value);
            this.kubeflowTrialConfig = KubeflowTrialConfigFactory.generateKubeflowTrialConfig(
                kubeflowTrialJsonObjsect,
                this.kubeflowClusterConfig.operator
            );

            // Validate to make sure codeDir doesn't have too many files
            try {
                await validateCodeDir(this.kubeflowTrialConfig.codeDir);
            } catch (error) {
                this.log.error(error);

                // Reject with the original Error so its message and stack survive;
                // only non-Error values are wrapped.
                return Promise.reject(error instanceof Error ? error : new Error(error));
            }
            break;
        }

        case TrialConfigMetadataKey.VERSION_CHECK:
            this.versionCheck = (value === 'true' || value === 'True');
            break;

        case TrialConfigMetadataKey.LOG_COLLECTION:
            this.logCollection = value;
            break;

        default:
    }
}
/**
 * Upload the trial's code files to the configured storage (Azure storage or NFS).
 *
 * @param trialJobId id of the trial job; used as the trial's directory name on the storage
 * @param trialLocalTempFolder local folder whose content is uploaded
 * @returns URL of the trial's `output` directory on the storage
 */
private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise<string> {
    if (this.kubeflowClusterConfig === undefined) {
        throw new Error('Kubeflow Cluster config is not initialized');
    }

    assert(this.kubeflowClusterConfig.storage === undefined
        || this.kubeflowClusterConfig.storage === 'azureStorage'
        || this.kubeflowClusterConfig.storage === 'nfs');

    const experimentId: string = getExperimentId();

    if (this.kubeflowClusterConfig.storage === 'azureStorage') {
        if (this.azureStorageClient === undefined) {
            throw new Error('azureStorageClient is not initialized');
        }
        try {
            // Upload local files to azure storage
            await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
                                                            `nni/${experimentId}/${trialJobId}`, this.azureStorageShare,
                                                            trialLocalTempFolder);
        } catch (error) {
            this.log.error(error);
            throw error;
        }

        // URLs always use '/', so the path part is joined with path.posix rather than the
        // platform-dependent path.join (which yields '\' on Windows).
        const azureOutputPath: string = path.posix.join('nni', experimentId, trialJobId, 'output');

        return `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}/${azureOutputPath}`;
    }

    // A missing storage setting is handled the same way as 'nfs'.
    if (this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) {
        const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
        const nfsTrialFolder: string = `${this.trialLocalNFSTempFolder}/nni/${experimentId}/${trialJobId}`;

        // NOTE(review): both paths are interpolated into shell commands unquoted;
        // paths containing spaces or shell metacharacters would break these commands.
        // Create work dir for current trial in NFS directory
        await cpp.exec(`mkdir -p ${nfsTrialFolder}`);
        // Copy code files from local dir to NFS mounted dir
        await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${nfsTrialFolder}/.`);

        const nfsConfig: NFSConfig = nfsKubeflowClusterConfig.nfs;
        const nfsOutputPath: string = path.posix.join(nfsConfig.path, 'nni', experimentId, trialJobId, 'output');

        return `nfs://${nfsConfig.server}:${nfsOutputPath}`;
    }

    return '';
}
/**
 * Prepare the local temp folder of a trial: copy the code directory into it and write the
 * NNI install script, one run script per configured role, and the hyper-parameter file.
 *
 * Scripts written: `install_nni.sh`; `run_worker.sh` when a worker is configured;
 * `run_ps.sh` (tf-operator) or `run_master.sh` (pytorch-operator) when that role is configured.
 *
 * @param trialLocalTempFolder local folder that receives the code and the generated files
 * @param trialJobId id of the trial job
 * @param trialWorkingFolder trial working folder passed on to the run script generator
 * @param curTrialSequenceId sequence id of the trial
 * @param form job application form; its hyper parameters, if any, are written to a parameter file
 * @throws Error when the cluster config or trial config is not initialized, or the operator is invalid
 */
private async prepareRunScript(trialLocalTempFolder: string, trialJobId: string, trialWorkingFolder: string, curTrialSequenceId: number,
                               form: JobApplicationForm): Promise<void> {
    if (this.kubeflowClusterConfig === undefined) {
        throw new Error('Kubeflow Cluster config is not initialized');
    }
    // Without this guard a missing trial config surfaces later as an opaque TypeError on `.codeDir`.
    if (this.kubeflowTrialConfig === undefined) {
        throw new Error('Kubeflow trial config is not initialized');
    }

    const operator: string = this.kubeflowClusterConfig.operator;
    if (operator !== 'tf-operator' && operator !== 'pytorch-operator') {
        throw Error(`operator ${this.kubeflowClusterConfig.operator} is invalid`);
    }

    // Both trial config flavours are read through the same loosely typed view below.
    const kubeflowTrialConfig: any = this.kubeflowTrialConfig;
    const sequenceId: string = curTrialSequenceId.toString();

    // Create the parent folder, then copy the code directory to the trial's local temp folder.
    // NOTE(review): codeDir and the temp folder are interpolated into shell commands unquoted.
    await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
    await cpp.exec(`cp -r ${kubeflowTrialConfig.codeDir} ${trialLocalTempFolder}`);

    // Write NNI installation file to local tmp files
    const runScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
    await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' });

    // Create tmp trial working folder locally.
    await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);

    // Write worker file content run_worker.sh to local tmp folders
    if (kubeflowTrialConfig.worker !== undefined) {
        const workerRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder,
                                                                            kubeflowTrialConfig.worker.command,
                                                                            sequenceId, 'worker',
                                                                            kubeflowTrialConfig.worker.gpuNum);
        await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_worker.sh'), workerRunScriptContent, { encoding: 'utf8' });
    }

    if (operator === 'tf-operator') {
        // Write parameter server file content run_ps.sh to local tmp folders
        const tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
        if (tensorflowTrialConfig.ps !== undefined) {
            const psRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder,
                                                                            tensorflowTrialConfig.ps.command,
                                                                            sequenceId, 'ps',
                                                                            tensorflowTrialConfig.ps.gpuNum);
            await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_ps.sh'), psRunScriptContent, { encoding: 'utf8' });
        }
    } else {
        // pytorch-operator: write master file content run_master.sh to local tmp folders
        const pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;
        if (pytorchTrialConfig.master !== undefined) {
            const masterRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder,
                                                                                pytorchTrialConfig.master.command,
                                                                                sequenceId, 'master',
                                                                                pytorchTrialConfig.master.gpuNum);
            await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_master.sh'), masterRunScriptContent, { encoding: 'utf8' });
        }
    }

    // Write file content ( parameter.cfg ) to local tmp folders
    const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form);
    if (trialForm !== undefined && trialForm.hyperParameters !== undefined) {
        await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)),
                                    trialForm.hyperParameters.value, { encoding: 'utf8' });
    }
}
/**
 * Build the Kubeflow job resource config object for one trial.
 *
 * Pod resources are computed for the worker and for the non-worker role (ps for tf-operator,
 * master for pytorch-operator); in both cases the limits are a copy of the requests.
 *
 * @param trialJobId id of the trial job
 * @param trialWorkingFolder trial working folder
 * @param kubeflowJobName name of the Kubeflow job
 * @returns the object produced by generateKubeflowJobConfig
 * @throws Error when the cluster config or trial config is not initialized, or the operator is invalid
 */
private async prepareKubeflowConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName: string): Promise<any> {
    if (this.kubeflowClusterConfig === undefined) {
        throw new Error('Kubeflow Cluster config is not initialized');
    }
    if (this.kubeflowTrialConfig === undefined) {
        throw new Error('Kubeflow trial config is not initialized');
    }

    const operatorName: string = this.kubeflowClusterConfig.operator;
    const usesTensorflow: boolean = operatorName === 'tf-operator';
    const usesPytorch: boolean = operatorName === 'pytorch-operator';
    if (!usesTensorflow && !usesPytorch) {
        throw Error(`operator ${this.kubeflowClusterConfig.operator} is invalid`);
    }

    // Worker resources: requests only when a worker is configured, limits mirror the requests.
    const trialConfigView: any = this.kubeflowTrialConfig;
    const workerPodResources : any = {};
    const workerSpec: any = trialConfigView.worker;
    if (workerSpec !== undefined) {
        workerPodResources.requests = this.generatePodResource(workerSpec.memoryMB, workerSpec.cpuNum, workerSpec.gpuNum);
    }
    workerPodResources.limits = Object.assign({}, workerPodResources.requests);

    // Non-worker resources: ps is optional for tf-operator, master is read directly for pytorch-operator.
    const nonWorkerResources : any = {};
    if (usesTensorflow) {
        const tfConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
        if (tfConfig.ps !== undefined) {
            nonWorkerResources.requests = this.generatePodResource(tfConfig.ps.memoryMB, tfConfig.ps.cpuNum, tfConfig.ps.gpuNum);
            nonWorkerResources.limits = Object.assign({}, nonWorkerResources.requests);
        }
    } else {
        const torchConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;
        nonWorkerResources.requests = this.generatePodResource(torchConfig.master.memoryMB, torchConfig.master.cpuNum,
                                                               torchConfig.master.gpuNum);
        nonWorkerResources.limits = Object.assign({}, nonWorkerResources.requests);
    }

    // Generate kubeflow job resource config object
    return this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources, nonWorkerResources);
}
/**
 * Store one cluster metadata entry on this training service.
 *
 * The NNI manager IP, Kubeflow cluster config, trial config, version check and log collection
 * keys are handled; every other key is ignored. Setting the trial config requires the cluster
 * config to be set first, and the promise is rejected when it is not or when codeDir validation fails.
 *
 * @param key metadata key
 * @param value metadata value; JSON text for the config keys
 */
public async setClusterMetadata(key: string, value: string): Promise<void> {
    if (key === TrialConfigMetadataKey.NNI_MANAGER_IP) {
        this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
    } else if (key === TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG) {
        const clusterJson: any = JSON.parse(value);
        this.kubeflowClusterConfig = KubeflowClusterConfigFactory.generateKubeflowClusterConfig(clusterJson);

        // Prepare the storage backend that matches the configured storage type.
        if (this.kubeflowClusterConfig.storageType === 'azureStorage') {
            const azureConfig: KubeflowClusterConfigAzure = <KubeflowClusterConfigAzure>this.kubeflowClusterConfig;
            this.azureStorageAccountName = azureConfig.azureStorage.accountName;
            this.azureStorageShare = azureConfig.azureStorage.azureShare;
            await this.createAzureStorage(
                azureConfig.keyVault.vaultName,
                azureConfig.keyVault.name,
                azureConfig.azureStorage.accountName,
                azureConfig.azureStorage.azureShare
            );
        } else if (this.kubeflowClusterConfig.storageType === 'nfs') {
            const nfsConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
            await this.createNFSStorage(nfsConfig.nfs.server, nfsConfig.nfs.path);
        }

        this.kubernetesCRDClient = KubeflowOperatorClient.generateOperatorClient(this.kubeflowClusterConfig.operator,
                                                                                 this.kubeflowClusterConfig.apiVersion);
    } else if (key === TrialConfigMetadataKey.TRIAL_CONFIG) {
        // The trial config factory needs the operator type from the cluster config.
        if (!this.kubeflowClusterConfig) {
            this.log.error('kubeflow cluster config is not initialized');

            return Promise.reject(new Error('kubeflow cluster config is not initialized'));
        }

        const trialJson: any = JSON.parse(value);
        this.kubeflowTrialConfig = KubeflowTrialConfigFactory.generateKubeflowTrialConfig(
            trialJson,
            this.kubeflowClusterConfig.operator
        );

        // Validate to make sure codeDir doesn't have too many files
        try {
            await validateCodeDir(this.kubeflowTrialConfig.codeDir);
        } catch (error) {
            this.log.error(error);

            // Reject with the caught Error itself to keep its message and stack;
            // anything that is not an Error is wrapped.
            return Promise.reject(error instanceof Error ? error : new Error(error));
        }
    } else if (key === TrialConfigMetadataKey.VERSION_CHECK) {
        this.versionCheck = (value === 'true' || value === 'True');
    } else if (key === TrialConfigMetadataKey.LOG_COLLECTION) {
        this.logCollection = value;
    }
}
/** /**
...@@ -343,49 +355,48 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -343,49 +355,48 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param workerPodResources worker pod template * @param workerPodResources worker pod template
* @param nonWorkerPodResources non-worker pod template, like ps or master * @param nonWorkerPodResources non-worker pod template, like ps or master
*/ */
private generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName : string, workerPodResources : any, nonWorkerPodResources?: any) : any { private generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName : string, workerPodResources : any,
if(!this.kubeflowClusterConfig) { nonWorkerPodResources?: any) : any {
if (this.kubeflowClusterConfig === undefined) {
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
} }
if(!this.kubeflowTrialConfig) { if (this.kubeflowTrialConfig === undefined) {
throw new Error('Kubeflow trial config is not initialized'); throw new Error('Kubeflow trial config is not initialized');
} }
if(!this.kubernetesCRDClient) { if (this.kubernetesCRDClient === undefined) {
throw new Error('Kubeflow operator client is not initialized'); throw new Error('Kubeflow operator client is not initialized');
} }
const replicaSpecsObj: any = {}; const replicaSpecsObj: any = {};
let replicaSpecsObjMap = new Map<string, object>(); const replicaSpecsObjMap: Map<string, object> = new Map<string, object>();
if(this.kubeflowTrialConfig.operatorType === 'tf-operator') { if (this.kubeflowTrialConfig.operatorType === 'tf-operator') {
let tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig; const tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas, replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas,
tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources); tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources);
if (tensorflowTrialConfig.ps !== undefined) {
if (tensorflowTrialConfig.ps){ replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas,
replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas, tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources);
tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources);
} }
replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {'tfReplicaSpecs': replicaSpecsObj}) replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {tfReplicaSpecs: replicaSpecsObj});
} } else if (this.kubeflowTrialConfig.operatorType === 'pytorch-operator') {
else if(this.kubeflowTrialConfig.operatorType === 'pytorch-operator') { const pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;
let pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig; if (pytorchTrialConfig.worker !== undefined) {
if(pytorchTrialConfig.worker) { replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas,
replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas, pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources);
pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources);
} }
replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas, replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas,
pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources); pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources);
replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {'pytorchReplicaSpecs': replicaSpecsObj}) replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {pytorchReplicaSpecs: replicaSpecsObj});
} }
return { return {
apiVersion: `kubeflow.org/${this.kubernetesCRDClient.apiVersion}`, apiVersion: `kubeflow.org/${this.kubernetesCRDClient.apiVersion}`,
kind: this.kubernetesCRDClient.jobKind, kind: this.kubernetesCRDClient.jobKind,
metadata: { metadata: {
name: kubeflowJobName, name: kubeflowJobName,
namespace: 'default', namespace: 'default',
labels: { labels: {
...@@ -395,7 +406,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -395,7 +406,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} }
}, },
spec: replicaSpecsObjMap.get(this.kubernetesCRDClient.jobKind) spec: replicaSpecsObjMap.get(this.kubernetesCRDClient.jobKind)
}; };
} }
/** /**
...@@ -406,21 +417,22 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -406,21 +417,22 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param runScriptFile script file name * @param runScriptFile script file name
* @param podResources pod resource config section * @param podResources pod resource config section
*/ */
private generateReplicaConfig(trialWorkingFolder: string, replicaNumber: number, replicaImage: string, runScriptFile: string, podResources: any): any { private generateReplicaConfig(trialWorkingFolder: string, replicaNumber: number, replicaImage: string, runScriptFile: string,
if(!this.kubeflowClusterConfig) { podResources: any): any {
if (this.kubeflowClusterConfig === undefined) {
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
} }
if(!this.kubeflowTrialConfig) { if (this.kubeflowTrialConfig === undefined) {
throw new Error('Kubeflow trial config is not initialized'); throw new Error('Kubeflow trial config is not initialized');
} }
if(!this.kubernetesCRDClient) { if (this.kubernetesCRDClient === undefined) {
throw new Error('Kubeflow operator client is not initialized'); throw new Error('Kubeflow operator client is not initialized');
} }
let volumeSpecMap = new Map<string, object>(); const volumeSpecMap: Map<string, object> = new Map<string, object>();
if(this.kubeflowClusterConfig.storageType === 'azureStorage'){ if (this.kubeflowClusterConfig.storageType === 'azureStorage') {
volumeSpecMap.set('nniVolumes', [ volumeSpecMap.set('nniVolumes', [
{ {
name: 'nni-vol', name: 'nni-vol',
...@@ -429,9 +441,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -429,9 +441,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
shareName: `${this.azureStorageShare}`, shareName: `${this.azureStorageShare}`,
readonly: false readonly: false
} }
}]) }]);
}else { } else {
let nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS> this.kubeflowClusterConfig; const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS> this.kubeflowClusterConfig;
volumeSpecMap.set('nniVolumes', [ volumeSpecMap.set('nniVolumes', [
{ {
name: 'nni-vol', name: 'nni-vol',
...@@ -439,13 +451,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -439,13 +451,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
server: `${nfsKubeflowClusterConfig.nfs.server}`, server: `${nfsKubeflowClusterConfig.nfs.server}`,
path: `${nfsKubeflowClusterConfig.nfs.path}` path: `${nfsKubeflowClusterConfig.nfs.path}`
} }
}]) }]);
} }
return { return {
replicas: replicaNumber, replicas: replicaNumber,
template: { template: {
metadata: { metadata: {
// tslint:disable-next-line:no-null-keyword
creationTimestamp: null creationTimestamp: null
}, },
spec: { spec: {
...@@ -455,7 +468,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -455,7 +468,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// TODO: change the name based on operator's type // TODO: change the name based on operator's type
name: this.kubernetesCRDClient.containerName, name: this.kubernetesCRDClient.containerName,
image: replicaImage, image: replicaImage,
args: ["sh", `${path.join(trialWorkingFolder, runScriptFile)}`], args: ['sh', `${path.join(trialWorkingFolder, runScriptFile)}`],
volumeMounts: [ volumeMounts: [
{ {
name: 'nni-vol', name: 'nni-vol',
...@@ -470,5 +483,5 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -470,5 +483,5 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}; };
} }
} }
// tslint:enable: no-unsafe-any no-any
export { KubeflowTrainingService } export { KubeflowTrainingService };
...@@ -19,44 +19,46 @@ ...@@ -19,44 +19,46 @@
'use strict'; 'use strict';
import * as os from 'os' import { Client1_10, config } from 'kubernetes-client';
import * as path from 'path';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
var K8SClient = require('kubernetes-client').Client;
var K8SConfig = require('kubernetes-client').config;
/** /**
* Generic Kubernetes client, target version >= 1.9 * Generic Kubernetes client, target version >= 1.9
*/ */
// tslint:disable: no-any no-unsafe-any
class GeneralK8sClient { class GeneralK8sClient {
protected readonly client: any; protected readonly client: any;
protected readonly log: Logger = getLogger(); protected readonly log: Logger = getLogger();
constructor() { constructor() {
this.client = new K8SClient({ config: K8SConfig.fromKubeconfig(), version: '1.9'}); this.client = new Client1_10({ config: config.fromKubeconfig(), version: '1.9'});
this.client.loadSpec(); this.client.loadSpec();
} }
public async createSecret(secretManifest: any): Promise<boolean> { public async createSecret(secretManifest: any): Promise<boolean> {
let result: Promise<boolean>; let result: Promise<boolean>;
const response : any = await this.client.api.v1.namespaces('default').secrets.post({body: secretManifest}); const response : any = await this.client.api.v1.namespaces('default').secrets
if(response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) { .post({body: secretManifest});
if (response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) {
result = Promise.resolve(true); result = Promise.resolve(true);
} else { } else {
result = Promise.reject(`Create secrets failed, statusCode is ${response.statusCode}`); result = Promise.reject(`Create secrets failed, statusCode is ${response.statusCode}`);
} }
return result; return result;
} }
} }
/**
* Kubernetes CRD client
*/
abstract class KubernetesCRDClient { abstract class KubernetesCRDClient {
protected readonly client: any; protected readonly client: any;
protected readonly log: Logger = getLogger(); protected readonly log: Logger = getLogger();
protected crdSchema: any; protected crdSchema: any;
constructor() { constructor() {
this.client = new K8SClient({ config: K8SConfig.fromKubeconfig() }); this.client = new Client1_10({ config: config.fromKubeconfig() });
this.client.loadSpec(); this.client.loadSpec();
} }
...@@ -65,8 +67,8 @@ abstract class KubernetesCRDClient { ...@@ -65,8 +67,8 @@ abstract class KubernetesCRDClient {
public abstract get containerName(): string; public abstract get containerName(): string;
public get jobKind(): string { public get jobKind(): string {
if(this.crdSchema if (this.crdSchema
&& this.crdSchema.spec && this.crdSchema.spec
&& this.crdSchema.spec.names && this.crdSchema.spec.names
&& this.crdSchema.spec.names.kind) { && this.crdSchema.spec.names.kind) {
return this.crdSchema.spec.names.kind; return this.crdSchema.spec.names.kind;
...@@ -76,55 +78,62 @@ abstract class KubernetesCRDClient { ...@@ -76,55 +78,62 @@ abstract class KubernetesCRDClient {
} }
public get apiVersion(): string { public get apiVersion(): string {
if(this.crdSchema if (this.crdSchema
&& this.crdSchema.spec && this.crdSchema.spec
&& this.crdSchema.spec.version) { && this.crdSchema.spec.version) {
return this.crdSchema.spec.version; return this.crdSchema.spec.version;
} else { } else {
throw new Error('KubeflowOperatorClient: get apiVersion failed, version is undefined in crd schema!'); throw new Error('KubeflowOperatorClient: get apiVersion failed, version is undefined in crd schema!');
} }
} }
public async createKubernetesJob(jobManifest: any): Promise<boolean> { public async createKubernetesJob(jobManifest: any): Promise<boolean> {
let result: Promise<boolean>; let result: Promise<boolean>;
const response : any = await this.operator.post({body: jobManifest}); const response : any = await this.operator.post({body: jobManifest});
if(response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) { if (response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) {
result = Promise.resolve(true); result = Promise.resolve(true);
} else { } else {
result = Promise.reject(`Create kubernetes job failed, statusCode is ${response.statusCode}`); result = Promise.reject(`Create kubernetes job failed, statusCode is ${response.statusCode}`);
} }
return result; return result;
} }
//TODO : replace any //TODO : replace any
public async getKubernetesJob(kubeflowJobName: string): Promise<any> { public async getKubernetesJob(kubeflowJobName: string): Promise<any> {
let result: Promise<any>; let result: Promise<any>;
const response : any = await this.operator(kubeflowJobName).get(); const response : any = await this.operator(kubeflowJobName)
if(response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) { .get();
if (response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) {
result = Promise.resolve(response.body); result = Promise.resolve(response.body);
} else { } else {
result = Promise.reject(`KubeflowOperatorClient get tfjobs failed, statusCode is ${response.statusCode}`); result = Promise.reject(`KubeflowOperatorClient get tfjobs failed, statusCode is ${response.statusCode}`);
} }
return result; return result;
} }
public async deleteKubernetesJob(labels: Map<string, string>): Promise<boolean> { public async deleteKubernetesJob(labels: Map<string, string>): Promise<boolean> {
let result: Promise<boolean>; let result: Promise<boolean>;
// construct match query from labels for deleting tfjob // construct match query from labels for deleting tfjob
const matchQuery: string = Array.from(labels.keys()).map(labelKey => `${labelKey}=${labels.get(labelKey)}`).join(','); const matchQuery: string = Array.from(labels.keys())
.map((labelKey: string) => `${labelKey}=${labels.get(labelKey)}`)
.join(',');
try { try {
const deleteResult : any = await this.operator().delete({ const deleteResult : any = await this.operator()
.delete({
qs: { qs: {
labelSelector: matchQuery, labelSelector: matchQuery,
propagationPolicy: "Background" propagationPolicy: 'Background'
} }
}); });
if(deleteResult.statusCode && deleteResult.statusCode >= 200 && deleteResult.statusCode <= 299) { if (deleteResult.statusCode && deleteResult.statusCode >= 200 && deleteResult.statusCode <= 299) {
result = Promise.resolve(true); result = Promise.resolve(true);
} else { } else {
result = Promise.reject(`KubeflowOperatorClient, delete labels ${matchQuery} get wrong statusCode ${deleteResult.statusCode}`); result = Promise.reject(
`KubeflowOperatorClient, delete labels ${matchQuery} get wrong statusCode ${deleteResult.statusCode}`);
} }
} catch(err) { } catch (err) {
result = Promise.reject(err); result = Promise.reject(err);
} }
......
...@@ -22,16 +22,17 @@ ...@@ -22,16 +22,17 @@
export type KubernetesStorageKind = 'nfs' | 'azureStorage'; export type KubernetesStorageKind = 'nfs' | 'azureStorage';
import { MethodNotImplementedError } from '../../common/errors'; import { MethodNotImplementedError } from '../../common/errors';
// tslint:disable: completed-docs function-name
export abstract class KubernetesClusterConfig { export abstract class KubernetesClusterConfig {
public readonly storage?: KubernetesStorageKind; public readonly storage?: KubernetesStorageKind;
public readonly apiVersion: string; public readonly apiVersion: string;
constructor(apiVersion: string, storage?: KubernetesStorageKind) { constructor(apiVersion: string, storage?: KubernetesStorageKind) {
this.storage = storage; this.storage = storage;
this.apiVersion = apiVersion; this.apiVersion = apiVersion;
} }
public get storageType(): KubernetesStorageKind{ public get storageType(): KubernetesStorageKind {
throw new MethodNotImplementedError(); throw new MethodNotImplementedError();
} }
} }
...@@ -48,7 +49,7 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig { ...@@ -48,7 +49,7 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
public readonly nfs: NFSConfig; public readonly nfs: NFSConfig;
constructor( constructor(
apiVersion: string, apiVersion: string,
nfs: NFSConfig, nfs: NFSConfig,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind
) { ) {
...@@ -56,12 +57,13 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig { ...@@ -56,12 +57,13 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
this.nfs = nfs; this.nfs = nfs;
} }
public get storageType(): KubernetesStorageKind{ public get storageType(): KubernetesStorageKind {
return 'nfs'; return 'nfs';
} }
public static getInstance(jsonObject: object): KubernetesClusterConfigNFS { public static getInstance(jsonObject: object): KubernetesClusterConfigNFS {
let kubernetesClusterConfigObjectNFS = <KubernetesClusterConfigNFS>jsonObject; const kubernetesClusterConfigObjectNFS: KubernetesClusterConfigNFS = <KubernetesClusterConfigNFS>jsonObject;
return new KubernetesClusterConfigNFS( return new KubernetesClusterConfigNFS(
kubernetesClusterConfigObjectNFS.apiVersion, kubernetesClusterConfigObjectNFS.apiVersion,
kubernetesClusterConfigObjectNFS.nfs, kubernetesClusterConfigObjectNFS.nfs,
...@@ -71,13 +73,13 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig { ...@@ -71,13 +73,13 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
} }
export class KubernetesClusterConfigAzure extends KubernetesClusterConfig { export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
public readonly keyVault: keyVaultConfig; public readonly keyVault: KeyVaultConfig;
public readonly azureStorage: AzureStorage; public readonly azureStorage: AzureStorage;
constructor( constructor(
apiVersion: string, apiVersion: string,
keyVault: keyVaultConfig, keyVault: KeyVaultConfig,
azureStorage: AzureStorage, azureStorage: AzureStorage,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind
) { ) {
super(apiVersion, storage); super(apiVersion, storage);
...@@ -85,12 +87,13 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig { ...@@ -85,12 +87,13 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
this.azureStorage = azureStorage; this.azureStorage = azureStorage;
} }
public get storageType(): KubernetesStorageKind{ public get storageType(): KubernetesStorageKind {
return 'azureStorage'; return 'azureStorage';
} }
public static getInstance(jsonObject: object): KubernetesClusterConfigAzure { public static getInstance(jsonObject: object): KubernetesClusterConfigAzure {
let kubernetesClusterConfigObjectAzure = <KubernetesClusterConfigAzure>jsonObject; const kubernetesClusterConfigObjectAzure: KubernetesClusterConfigAzure = <KubernetesClusterConfigAzure>jsonObject;
return new KubernetesClusterConfigAzure( return new KubernetesClusterConfigAzure(
kubernetesClusterConfigObjectAzure.apiVersion, kubernetesClusterConfigObjectAzure.apiVersion,
kubernetesClusterConfigObjectAzure.keyVault, kubernetesClusterConfigObjectAzure.keyVault,
...@@ -100,17 +103,20 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig { ...@@ -100,17 +103,20 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
} }
} }
// tslint:disable-next-line:no-unnecessary-class
export class KubernetesClusterConfigFactory { export class KubernetesClusterConfigFactory {
public static generateKubernetesClusterConfig(jsonObject: object): KubernetesClusterConfig { public static generateKubernetesClusterConfig(jsonObject: object): KubernetesClusterConfig {
let storageConfig = <StorageConfig>jsonObject; const storageConfig: StorageConfig = <StorageConfig>jsonObject;
switch(storageConfig.storage) { switch (storageConfig.storage) {
case 'azureStorage': case 'azureStorage':
return KubernetesClusterConfigAzure.getInstance(jsonObject); return KubernetesClusterConfigAzure.getInstance(jsonObject);
case 'nfs' || undefined : case 'nfs':
case undefined:
return KubernetesClusterConfigNFS.getInstance(jsonObject); return KubernetesClusterConfigNFS.getInstance(jsonObject);
default:
throw new Error(`Invalid json object ${jsonObject}`);
} }
throw new Error(`Invalid json object ${jsonObject}`);
} }
} }
...@@ -118,9 +124,9 @@ export class KubernetesClusterConfigFactory { ...@@ -118,9 +124,9 @@ export class KubernetesClusterConfigFactory {
* NFS configuration to store Kubeflow job related files * NFS configuration to store Kubeflow job related files
*/ */
export class NFSConfig { export class NFSConfig {
/** IP address of NFS server */ // IP address of NFS server
public readonly server : string; public readonly server : string;
/** exported NFS path on NFS server */ // exported NFS path on NFS server
public readonly path : string; public readonly path : string;
constructor(server : string, path : string) { constructor(server : string, path : string) {
...@@ -133,13 +139,13 @@ export class NFSConfig { ...@@ -133,13 +139,13 @@ export class NFSConfig {
* KeyVault configuration to store the key of Azure Storage Service * KeyVault configuration to store the key of Azure Storage Service
* Refer https://docs.microsoft.com/en-us/azure/key-vault/key-vault-manage-with-cli2 * Refer https://docs.microsoft.com/en-us/azure/key-vault/key-vault-manage-with-cli2
*/ */
export class keyVaultConfig { export class KeyVaultConfig {
/**The vault-name to specify vault */ // The vault-name to specify vault
public readonly vaultName : string; public readonly vaultName : string;
/**The name to specify private key */ // The name to specify private key
public readonly name : string; public readonly name : string;
constructor(vaultName : string, name : string){ constructor(vaultName : string, name : string) {
this.vaultName = vaultName; this.vaultName = vaultName;
this.name = name; this.name = name;
} }
...@@ -149,12 +155,12 @@ export class keyVaultConfig { ...@@ -149,12 +155,12 @@ export class keyVaultConfig {
* Azure Storage Service * Azure Storage Service
*/ */
export class AzureStorage { export class AzureStorage {
/**The azure share to store files */ // The azure share to store files
public readonly azureShare : string; public readonly azureShare : string;
/**The account name of storage service */ // The account name of storage service
public readonly accountName: string; public readonly accountName: string;
constructor(azureShare : string, accountName: string){ constructor(azureShare : string, accountName: string) {
this.azureShare = azureShare; this.azureShare = azureShare;
this.accountName = accountName; this.accountName = accountName;
} }
...@@ -164,23 +170,23 @@ export class AzureStorage { ...@@ -164,23 +170,23 @@ export class AzureStorage {
* Trial job configuration for Kubernetes * Trial job configuration for Kubernetes
*/ */
export class KubernetesTrialConfigTemplate { export class KubernetesTrialConfigTemplate {
/** CPU number */ // CPU number
public readonly cpuNum: number; public readonly cpuNum: number;
/** Memory */ // Memory
public readonly memoryMB: number; public readonly memoryMB: number;
/** Docker image */ // Docker image
public readonly image: string; public readonly image: string;
/** Trial command */ // Trial command
public readonly command : string; public readonly command : string;
/** Required GPU number for trial job. The number should be in [0,100] */ // Required GPU number for trial job. The number should be in [0,100]
public readonly gpuNum : number; public readonly gpuNum : number;
constructor(command : string, gpuNum : number, constructor(command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string) { cpuNum: number, memoryMB: number, image: string) {
this.command = command; this.command = command;
this.gpuNum = gpuNum; this.gpuNum = gpuNum;
this.cpuNum = cpuNum; this.cpuNum = cpuNum;
...@@ -195,4 +201,4 @@ export class KubernetesTrialConfig { ...@@ -195,4 +201,4 @@ export class KubernetesTrialConfig {
constructor(codeDir: string) { constructor(codeDir: string) {
this.codeDir = codeDir; this.codeDir = codeDir;
} }
} }
\ No newline at end of file
...@@ -24,7 +24,6 @@ import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../commo ...@@ -24,7 +24,6 @@ import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../commo
/** /**
* KubeflowTrialJobDetail * KubeflowTrialJobDetail
*/ */
// tslint:disable-next-line:max-classes-per-file
export class KubernetesTrialJobDetail implements TrialJobDetail { export class KubernetesTrialJobDetail implements TrialJobDetail {
public id: string; public id: string;
public status: TrialJobStatus; public status: TrialJobStatus;
...@@ -40,7 +39,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail { ...@@ -40,7 +39,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
public queryJobFailedCount: number; public queryJobFailedCount: number;
constructor(id: string, status: TrialJobStatus, submitTime: number, constructor(id: string, status: TrialJobStatus, submitTime: number,
workingDirectory: string, form: JobApplicationForm, workingDirectory: string, form: JobApplicationForm,
kubernetesJobName: string, sequenceId: number, url: string) { kubernetesJobName: string, sequenceId: number, url: string) {
this.id = id; this.id = id;
this.status = status; this.status = status;
...@@ -55,7 +54,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail { ...@@ -55,7 +54,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
} }
} }
export const KubernetesScriptFormat = export const kubernetesScriptFormat: string =
`#!/bin/bash `#!/bin/bash
export NNI_PLATFORM={0} export NNI_PLATFORM={0}
export NNI_SYS_DIR=$PWD/nni/{1} export NNI_SYS_DIR=$PWD/nni/{1}
...@@ -71,5 +70,5 @@ mkdir -p $NNI_OUTPUT_DIR ...@@ -71,5 +70,5 @@ mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR cd $NNI_SYS_DIR
sh install_nni.sh sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} --nni_manager_version '{11}' --log_collection '{12}'` python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} \
+ `1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr` --nni_manager_version '{11}' --log_collection '{12}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`;
...@@ -20,11 +20,10 @@ ...@@ -20,11 +20,10 @@
'use strict'; 'use strict';
import * as assert from 'assert'; import * as assert from 'assert';
import { MethodNotImplementedError, NNIError, NNIErrorNames } from '../../common/errors';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { NNIError, NNIErrorNames } from '../../common/errors';
import { TrialJobStatus } from '../../common/trainingService'; import { TrialJobStatus } from '../../common/trainingService';
import { KubernetesCRDClient } from './kubernetesApiClient'; import { KubernetesCRDClient } from './kubernetesApiClient';
import { MethodNotImplementedError } from '../../common/errors';
import { KubernetesTrialJobDetail } from './kubernetesData'; import { KubernetesTrialJobDetail } from './kubernetesData';
/** /**
...@@ -43,22 +42,22 @@ export class KubernetesJobInfoCollector { ...@@ -43,22 +42,22 @@ export class KubernetesJobInfoCollector {
public async retrieveTrialStatus(kubernetesCRDClient: KubernetesCRDClient | undefined) : Promise<void> { public async retrieveTrialStatus(kubernetesCRDClient: KubernetesCRDClient | undefined) : Promise<void> {
assert(kubernetesCRDClient !== undefined); assert(kubernetesCRDClient !== undefined);
const updateKubernetesTrialJobs : Promise<void>[] = []; const updateKubernetesTrialJobs : Promise<void>[] = [];
for(let [trialJobId, kubernetesTrialJob] of this.trialJobsMap) { for (const [trialJobId, kubernetesTrialJob] of this.trialJobsMap) {
if (!kubernetesTrialJob) { if (kubernetesTrialJob === undefined) {
throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`); throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`);
} }
// Since Kubeflow needs some delay to schedule jobs, we provide 20 seconds buffer time to check kubeflow job's status // Since Kubeflow needs some delay to schedule jobs, we provide 20 seconds buffer time to check kubeflow job's status
if( Date.now() - kubernetesTrialJob.submitTime < 20 * 1000) { if (Date.now() - kubernetesTrialJob.submitTime < 20 * 1000) {
return Promise.resolve(); return Promise.resolve();
} }
updateKubernetesTrialJobs.push(this.retrieveSingleTrialJobInfo(kubernetesCRDClient, kubernetesTrialJob)) updateKubernetesTrialJobs.push(this.retrieveSingleTrialJobInfo(kubernetesCRDClient, kubernetesTrialJob));
} }
await Promise.all(updateKubernetesTrialJobs); await Promise.all(updateKubernetesTrialJobs);
} }
protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined, protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> { kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> {
throw new MethodNotImplementedError(); throw new MethodNotImplementedError();
} }
} }
\ No newline at end of file
...@@ -19,19 +19,19 @@ ...@@ -19,19 +19,19 @@
'use strict'; 'use strict';
import * as component from '../../common/component';
import { Inject } from 'typescript-ioc'; import { Inject } from 'typescript-ioc';
import * as component from '../../common/component';
import { ClusterJobRestServer } from '../common/clusterJobRestServer';
import { KubernetesTrainingService } from './kubernetesTrainingService'; import { KubernetesTrainingService } from './kubernetesTrainingService';
import { ClusterJobRestServer } from '../common/clusterJobRestServer'
/** /**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update * Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
* *
*/ */
@component.Singleton @component.Singleton
export class KubernetesJobRestServer extends ClusterJobRestServer{ export class KubernetesJobRestServer extends ClusterJobRestServer {
@Inject @Inject
private kubernetesTrainingService? : KubernetesTrainingService; private readonly kubernetesTrainingService? : KubernetesTrainingService;
/** /**
* constructor to provide NNIRestServer's own rest property, e.g. port * constructor to provide NNIRestServer's own rest property, e.g. port
...@@ -41,8 +41,9 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{ ...@@ -41,8 +41,9 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
this.kubernetesTrainingService = kubernetesTrainingService; this.kubernetesTrainingService = kubernetesTrainingService;
} }
// tslint:disable-next-line:no-any
protected handleTrialMetrics(jobId : string, metrics : any[]) : void { protected handleTrialMetrics(jobId : string, metrics : any[]) : void {
if(!this.kubernetesTrainingService) { if (this.kubernetesTrainingService === undefined) {
throw Error('kubernetesTrainingService not initialized!'); throw Error('kubernetesTrainingService not initialized!');
} }
// Split metrics array into single metric, then emit // Split metrics array into single metric, then emit
...@@ -53,5 +54,5 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{ ...@@ -53,5 +54,5 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
data : singleMetric data : singleMetric
}); });
} }
} }
} }
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment