Commit ae7a72bc authored by Hongarc's avatar Hongarc Committed by Chi Song
Browse files

Remove all whitespace at end of line (#1162)

parent 14c1b31c
...@@ -32,7 +32,7 @@ import { Writable } from 'stream'; ...@@ -32,7 +32,7 @@ import { Writable } from 'stream';
/** /**
* Cluster Job Training service Rest server, provides rest API to support Cluster job metrics update * Cluster Job Training service Rest server, provides rest API to support Cluster job metrics update
* *
*/ */
@component.Singleton @component.Singleton
export abstract class ClusterJobRestServer extends RestServer{ export abstract class ClusterJobRestServer extends RestServer{
...@@ -52,8 +52,8 @@ export abstract class ClusterJobRestServer extends RestServer{ ...@@ -52,8 +52,8 @@ export abstract class ClusterJobRestServer extends RestServer{
super(); super();
const basePort: number = getBasePort(); const basePort: number = getBasePort();
assert(basePort && basePort > 1024); assert(basePort && basePort > 1024);
this.port = basePort + 1; this.port = basePort + 1;
} }
public get clusterRestServerPort(): number { public get clusterRestServerPort(): number {
...@@ -62,11 +62,11 @@ export abstract class ClusterJobRestServer extends RestServer{ ...@@ -62,11 +62,11 @@ export abstract class ClusterJobRestServer extends RestServer{
} }
return this.port; return this.port;
} }
public get getErrorMessage(): string | undefined{ public get getErrorMessage(): string | undefined{
return this.errorMessage; return this.errorMessage;
} }
public set setEnableVersionCheck(versionCheck: boolean) { public set setEnableVersionCheck(versionCheck: boolean) {
this.enableVersionCheck = versionCheck; this.enableVersionCheck = versionCheck;
} }
......
...@@ -19,12 +19,12 @@ ...@@ -19,12 +19,12 @@
'use strict'; 'use strict';
export const CONTAINER_INSTALL_NNI_SHELL_FORMAT: string = export const CONTAINER_INSTALL_NNI_SHELL_FORMAT: string =
`#!/bin/bash `#!/bin/bash
if python3 -c 'import nni' > /dev/null 2>&1; then if python3 -c 'import nni' > /dev/null 2>&1; then
# nni module is already installed, skip # nni module is already installed, skip
return return
else else
# Install nni # Install nni
python3 -m pip install --user --upgrade nni python3 -m pip install --user --upgrade nni
fi`; fi`;
\ No newline at end of file
...@@ -59,7 +59,7 @@ export class GPUSummary { ...@@ -59,7 +59,7 @@ export class GPUSummary {
} }
} }
export const GPU_INFO_COLLECTOR_FORMAT_LINUX: string = export const GPU_INFO_COLLECTOR_FORMAT_LINUX: string =
` `
#!/bin/bash #!/bin/bash
export METRIC_OUTPUT_DIR={0} export METRIC_OUTPUT_DIR={0}
...@@ -67,7 +67,7 @@ echo $$ >{1} ...@@ -67,7 +67,7 @@ echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector python3 -m nni_gpu_tool.gpu_metrics_collector
` `
export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string = export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
` `
$env:METRIC_OUTPUT_DIR="{0}" $env:METRIC_OUTPUT_DIR="{0}"
$app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow $app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow
......
...@@ -34,7 +34,7 @@ import { file } from "../../node_modules/@types/tmp"; ...@@ -34,7 +34,7 @@ import { file } from "../../node_modules/@types/tmp";
/** /**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken * Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
* *
* @param codeDir codeDir in nni config file * @param codeDir codeDir in nni config file
* @returns file number under codeDir * @returns file number under codeDir
*/ */
...@@ -48,9 +48,9 @@ export async function validateCodeDir(codeDir: string) : Promise<number> { ...@@ -48,9 +48,9 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
} }
if(fileCount && fileCount > 1000) { if(fileCount && fileCount > 1000) {
const errMessage: string = `Too many files(${fileCount} found}) in ${codeDir},` const errMessage: string = `Too many files(${fileCount} found}) in ${codeDir},`
+ ` please check if it's a valid code dir`; + ` please check if it's a valid code dir`;
throw new Error(errMessage); throw new Error(errMessage);
} }
return fileCount; return fileCount;
...@@ -58,7 +58,7 @@ export async function validateCodeDir(codeDir: string) : Promise<number> { ...@@ -58,7 +58,7 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
/** /**
* crete a new directory * crete a new directory
* @param directory * @param directory
*/ */
export async function execMkdir(directory: string): Promise<void> { export async function execMkdir(directory: string): Promise<void> {
if (process.platform === 'win32') { if (process.platform === 'win32') {
...@@ -85,7 +85,7 @@ export async function execCopydir(source: string, destination: string): Promise< ...@@ -85,7 +85,7 @@ export async function execCopydir(source: string, destination: string): Promise<
/** /**
* crete a new file * crete a new file
* @param filename * @param filename
*/ */
export async function execNewFile(filename: string): Promise<void> { export async function execNewFile(filename: string): Promise<void> {
if (process.platform === 'win32') { if (process.platform === 'win32') {
...@@ -110,7 +110,7 @@ export function execScript(filePath: string): cp.ChildProcess { ...@@ -110,7 +110,7 @@ export function execScript(filePath: string): cp.ChildProcess {
/** /**
* output the last line of a file * output the last line of a file
* @param filePath * @param filePath
*/ */
export async function execTail(filePath: string): Promise<cpp.childProcessPromise.Result> { export async function execTail(filePath: string): Promise<cpp.childProcessPromise.Result> {
let cmdresult: cpp.childProcessPromise.Result; let cmdresult: cpp.childProcessPromise.Result;
...@@ -124,7 +124,7 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis ...@@ -124,7 +124,7 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis
/** /**
* delete a directory * delete a directory
* @param directory * @param directory
*/ */
export async function execRemove(directory: string): Promise<void> { export async function execRemove(directory: string): Promise<void> {
if (process.platform === 'win32') { if (process.platform === 'win32') {
...@@ -137,7 +137,7 @@ export async function execRemove(directory: string): Promise<void> { ...@@ -137,7 +137,7 @@ export async function execRemove(directory: string): Promise<void> {
/** /**
* kill a process * kill a process
* @param directory * @param directory
*/ */
export async function execKill(pid: string): Promise<void> { export async function execKill(pid: string): Promise<void> {
if (process.platform === 'win32') { if (process.platform === 'win32') {
...@@ -151,7 +151,7 @@ export async function execKill(pid: string): Promise<void> { ...@@ -151,7 +151,7 @@ export async function execKill(pid: string): Promise<void> {
/** /**
* set environment variable * set environment variable
* @param variable * @param variable
* @returns command string * @returns command string
*/ */
export function setEnvironmentVariable(variable: { key: string; value: string }): string { export function setEnvironmentVariable(variable: { key: string; value: string }): string {
if (process.platform === 'win32') { if (process.platform === 'win32') {
...@@ -191,7 +191,7 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi ...@@ -191,7 +191,7 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi
/** /**
* generate script file name * generate script file name
* @param fileNamePrefix * @param fileNamePrefix
*/ */
export function getScriptName(fileNamePrefix: string): string { export function getScriptName(fileNamePrefix: string): string {
if (process.platform === 'win32') { if (process.platform === 'win32') {
...@@ -203,7 +203,7 @@ export function getScriptName(fileNamePrefix: string): string { ...@@ -203,7 +203,7 @@ export function getScriptName(fileNamePrefix: string): string {
/** /**
* generate script file * generate script file
* @param gpuMetricCollectorScriptFolder * @param gpuMetricCollectorScriptFolder
*/ */
export function getgpuMetricsCollectorScriptContent(gpuMetricCollectorScriptFolder: string): string { export function getgpuMetricsCollectorScriptContent(gpuMetricCollectorScriptFolder: string): string {
if(process.platform === 'win32') { if(process.platform === 'win32') {
......
...@@ -26,11 +26,11 @@ import { getLogger } from '../../common/log'; ...@@ -26,11 +26,11 @@ import { getLogger } from '../../common/log';
import { mkDirP } from '../../common/utils'; import { mkDirP } from '../../common/utils';
export namespace AzureStorageClientUtility { export namespace AzureStorageClientUtility {
/** /**
* create azure share * create azure share
* @param fileServerClient * @param fileServerClient
* @param azureShare * @param azureShare
*/ */
export async function createShare(fileServerClient: any, azureShare: any): Promise<void>{ export async function createShare(fileServerClient: any, azureShare: any): Promise<void>{
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
...@@ -44,12 +44,12 @@ export namespace AzureStorageClientUtility { ...@@ -44,12 +44,12 @@ export namespace AzureStorageClientUtility {
}) })
return deferred.promise; return deferred.promise;
} }
/** /**
* Create a new directory (NOT recursively) in azure file storage. * Create a new directory (NOT recursively) in azure file storage.
* @param fileServerClient * @param fileServerClient
* @param azureFoler * @param azureFoler
* @param azureShare * @param azureShare
*/ */
export async function createDirectory(fileServerClient: any, azureFoler: any, azureShare: any): Promise<void>{ export async function createDirectory(fileServerClient: any, azureFoler: any, azureShare: any): Promise<void>{
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
...@@ -67,7 +67,7 @@ export namespace AzureStorageClientUtility { ...@@ -67,7 +67,7 @@ export namespace AzureStorageClientUtility {
/** /**
* Create a new directory recursively in azure file storage * Create a new directory recursively in azure file storage
* @param fileServerClient * @param fileServerClient
* @param azureDirectory * @param azureDirectory
*/ */
export async function createDirectoryRecursive(fileServerClient: any, azureDirectory: any, azureShare: any): Promise<void>{ export async function createDirectoryRecursive(fileServerClient: any, azureDirectory: any, azureShare: any): Promise<void>{
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
...@@ -81,14 +81,14 @@ export namespace AzureStorageClientUtility { ...@@ -81,14 +81,14 @@ export namespace AzureStorageClientUtility {
deferred.resolve(); deferred.resolve();
return deferred.promise; return deferred.promise;
} }
/** /**
* upload a file to azure storage * upload a file to azure storage
* @param fileServerClient * @param fileServerClient
* @param azureDirectory * @param azureDirectory
* @param azureFileName * @param azureFileName
* @param azureShare * @param azureShare
* @param localFilePath * @param localFilePath
*/ */
async function uploadFileToAzure(fileServerClient: any, azureDirectory: any, azureFileName: any, azureShare: any, localFilePath: any): Promise<void>{ async function uploadFileToAzure(fileServerClient: any, azureDirectory: any, azureFileName: any, azureShare: any, localFilePath: any): Promise<void>{
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
...@@ -96,20 +96,20 @@ export namespace AzureStorageClientUtility { ...@@ -96,20 +96,20 @@ export namespace AzureStorageClientUtility {
if(error){ if(error){
getLogger().error(`Upload file failed:, ${error}`); getLogger().error(`Upload file failed:, ${error}`);
deferred.reject(error); deferred.reject(error);
}else{ }else{
deferred.resolve(); deferred.resolve();
} }
}) })
return deferred.promise; return deferred.promise;
} }
/** /**
* download a file from azure storage * download a file from azure storage
* @param fileServerClient * @param fileServerClient
* @param azureDirectory * @param azureDirectory
* @param azureFileName * @param azureFileName
* @param azureShare * @param azureShare
* @param localFilePath * @param localFilePath
*/ */
async function downloadFile(fileServerClient: any, azureDirectory: any, azureFileName: any, azureShare: any, localFilePath: any): Promise<void>{ async function downloadFile(fileServerClient: any, azureDirectory: any, azureFileName: any, azureShare: any, localFilePath: any): Promise<void>{
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
...@@ -118,7 +118,7 @@ export namespace AzureStorageClientUtility { ...@@ -118,7 +118,7 @@ export namespace AzureStorageClientUtility {
getLogger().error(`Download file failed:, ${error}`); getLogger().error(`Download file failed:, ${error}`);
deferred.reject(error); deferred.reject(error);
}else{ }else{
deferred.resolve(); deferred.resolve();
} }
}) })
return deferred.promise; return deferred.promise;
...@@ -153,13 +153,13 @@ export namespace AzureStorageClientUtility { ...@@ -153,13 +153,13 @@ export namespace AzureStorageClientUtility {
deferred.resolve(); deferred.resolve();
return deferred.promise; return deferred.promise;
} }
/** /**
* downlod a directory from azure * downlod a directory from azure
* @param fileServerClient * @param fileServerClient
* @param azureDirectory * @param azureDirectory
* @param azureShare * @param azureShare
* @param localDirectory * @param localDirectory
*/ */
export async function downloadDirectory(fileServerClient: any, azureDirectory:any, azureShare: any, localDirectory: any): Promise<void>{ export async function downloadDirectory(fileServerClient: any, azureDirectory:any, azureShare: any, localDirectory: any): Promise<void>{
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
...@@ -184,7 +184,7 @@ export namespace AzureStorageClientUtility { ...@@ -184,7 +184,7 @@ export namespace AzureStorageClientUtility {
const fullFilePath: string = path.join(localDirectory, fileName.name); const fullFilePath: string = path.join(localDirectory, fileName.name);
downloadFile(fileServerClient, azureDirectory, fileName.name, azureShare, fullFilePath) downloadFile(fileServerClient, azureDirectory, fileName.name, azureShare, fullFilePath)
} }
for(var directoryName of result['entries']['directories']){ for(var directoryName of result['entries']['directories']){
const fullDirectoryPath: string = path.join(localDirectory, directoryName.name) const fullDirectoryPath: string = path.join(localDirectory, directoryName.name)
const fullAzureDirectory: string = path.join(azureDirectory, directoryName.name) const fullAzureDirectory: string = path.join(azureDirectory, directoryName.name)
......
...@@ -47,7 +47,7 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient { ...@@ -47,7 +47,7 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient {
public get containerName(): string { public get containerName(): string {
return 'framework'; return 'framework';
} }
} }
export { FrameworkControllerClient, GeneralK8sClient }; export { FrameworkControllerClient, GeneralK8sClient };
......
...@@ -40,8 +40,8 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi ...@@ -40,8 +40,8 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
public readonly frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy; public readonly frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy;
public readonly name: string; public readonly name: string;
public readonly taskNum: number; public readonly taskNum: number;
constructor(taskNum: number, command : string, gpuNum : number, constructor(taskNum: number, command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string, cpuNum: number, memoryMB: number, image: string,
frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy) { frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy) {
super(command, gpuNum, cpuNum, memoryMB, image); super(command, gpuNum, cpuNum, memoryMB, image);
this.frameworkAttemptCompletionPolicy = frameworkAttemptCompletionPolicy; this.frameworkAttemptCompletionPolicy = frameworkAttemptCompletionPolicy;
...@@ -71,8 +71,8 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig { ...@@ -71,8 +71,8 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig {
export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfigNFS { export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfigNFS {
public readonly serviceAccountName: string; public readonly serviceAccountName: string;
constructor( constructor(
serviceAccountName: string, serviceAccountName: string,
apiVersion: string, apiVersion: string,
nfs: NFSConfig, nfs: NFSConfig,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind
) { ) {
...@@ -94,12 +94,12 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig ...@@ -94,12 +94,12 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig
export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConfigAzure { export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConfigAzure {
public readonly serviceAccountName: string; public readonly serviceAccountName: string;
constructor( constructor(
serviceAccountName: string, serviceAccountName: string,
apiVersion: string, apiVersion: string,
keyVault: keyVaultConfig, keyVault: keyVaultConfig,
azureStorage: AzureStorage, azureStorage: AzureStorage,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind
) { ) {
super(apiVersion, keyVault, azureStorage,storage); super(apiVersion, keyVault, azureStorage,storage);
......
...@@ -32,7 +32,7 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec ...@@ -32,7 +32,7 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
super(jobMap); super(jobMap);
} }
protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined, protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> { kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> {
if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) { if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) {
return Promise.resolve(); return Promise.resolve();
...@@ -44,7 +44,7 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec ...@@ -44,7 +44,7 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
let kubernetesJobInfo: any; let kubernetesJobInfo: any;
try { try {
kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName); kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName);
} catch(error) { } catch(error) {
this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`); this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`);
//This is not treat as a error status //This is not treat as a error status
...@@ -71,9 +71,9 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec ...@@ -71,9 +71,9 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
break; break;
case 'Failed': case 'Failed':
kubernetesTrialJob.status = 'FAILED'; kubernetesTrialJob.status = 'FAILED';
break; break;
} }
kubernetesTrialJob.endTime = Date.parse(<string>kubernetesJobInfo.status.completionTime); kubernetesTrialJob.endTime = Date.parse(<string>kubernetesJobInfo.status.completionTime);
break; break;
default: default:
break; break;
......
...@@ -25,11 +25,11 @@ import { KubernetesJobRestServer } from '../kubernetesJobRestServer' ...@@ -25,11 +25,11 @@ import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/** /**
* frameworkcontroller Training service Rest server, provides rest API to support frameworkcontroller job metrics update * frameworkcontroller Training service Rest server, provides rest API to support frameworkcontroller job metrics update
* *
*/ */
@component.Singleton @component.Singleton
export class FrameworkControllerJobRestServer extends KubernetesJobRestServer{ export class FrameworkControllerJobRestServer extends KubernetesJobRestServer{
constructor() { constructor() {
super(component.get(FrameworkControllerTrainingService)); super(component.get(FrameworkControllerTrainingService));
} }
} }
\ No newline at end of file
...@@ -37,7 +37,7 @@ import { KubernetesTrialJobDetail } from '../kubernetesData'; ...@@ -37,7 +37,7 @@ import { KubernetesTrialJobDetail } from '../kubernetesData';
import { validateCodeDir } from '../../common/util'; import { validateCodeDir } from '../../common/util';
import { AzureStorageClientUtility } from '../azureStorageClientUtils'; import { AzureStorageClientUtility } from '../azureStorageClientUtils';
import { KubernetesTrainingService } from '../kubernetesTrainingService'; import { KubernetesTrainingService } from '../kubernetesTrainingService';
import { FrameworkControllerTrialConfig, FrameworkControllerClusterConfig, FrameworkControllerClusterConfigAzure, FrameworkControllerClusterConfigNFS, import { FrameworkControllerTrialConfig, FrameworkControllerClusterConfig, FrameworkControllerClusterConfigAzure, FrameworkControllerClusterConfigNFS,
FrameworkControllerClusterConfigFactory} from './frameworkcontrollerConfig'; FrameworkControllerClusterConfigFactory} from './frameworkcontrollerConfig';
import { FrameworkControllerJobRestServer } from './frameworkcontrollerJobRestServer'; import { FrameworkControllerJobRestServer } from './frameworkcontrollerJobRestServer';
import { FrameworkControllerClient } from './frameworkcontrollerApiClient'; import { FrameworkControllerClient } from './frameworkcontrollerApiClient';
...@@ -56,7 +56,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -56,7 +56,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
constructor() { constructor() {
super(); super();
this.fcJobInfoCollector = new FrameworkControllerJobInfoCollector(this.trialJobsMap); this.fcJobInfoCollector = new FrameworkControllerJobInfoCollector(this.trialJobsMap);
this.experimentId = getExperimentId(); this.experimentId = getExperimentId();
this.nextTrialSequenceId = -1; this.nextTrialSequenceId = -1;
} }
...@@ -69,7 +69,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -69,7 +69,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
this.kubernetesJobRestServer.setEnableVersionCheck = this.versionCheck; this.kubernetesJobRestServer.setEnableVersionCheck = this.versionCheck;
this.log.info(`frameworkcontroller Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`); this.log.info(`frameworkcontroller Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`);
while (!this.stopping) { while (!this.stopping) {
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server // collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
await delay(3000); await delay(3000);
await this.fcJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient); await this.fcJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient);
if(this.kubernetesJobRestServer.getErrorMessage) { if(this.kubernetesJobRestServer.getErrorMessage) {
...@@ -101,10 +101,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -101,10 +101,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
//Generate the port used for taskRole //Generate the port used for taskRole
this.generateContainerPort(); this.generateContainerPort();
await this.prepareRunScript(trialLocalTempFolder, curTrialSequenceId, trialJobId, trialWorkingFolder, form); await this.prepareRunScript(trialLocalTempFolder, curTrialSequenceId, trialJobId, trialWorkingFolder, form);
//upload code files //upload code files
let trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); let trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder);
const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail( const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail(
trialJobId, trialJobId,
'WAITING', 'WAITING',
...@@ -116,14 +116,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -116,14 +116,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
trialJobOutputUrl trialJobOutputUrl
); );
// Set trial job detail until create frameworkcontroller job successfully // Set trial job detail until create frameworkcontroller job successfully
this.trialJobsMap.set(trialJobId, trialJobDetail); this.trialJobsMap.set(trialJobId, trialJobDetail);
// Create frameworkcontroller job based on generated frameworkcontroller job resource config // Create frameworkcontroller job based on generated frameworkcontroller job resource config
const frameworkcontrollerJobConfig = await this.prepareFrameworkControllerConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName); const frameworkcontrollerJobConfig = await this.prepareFrameworkControllerConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName);
await this.kubernetesCRDClient.createKubernetesJob(frameworkcontrollerJobConfig); await this.kubernetesCRDClient.createKubernetesJob(frameworkcontrollerJobConfig);
// Set trial job detail until create frameworkcontroller job successfully // Set trial job detail until create frameworkcontroller job successfully
this.trialJobsMap.set(trialJobId, trialJobDetail); this.trialJobsMap.set(trialJobId, trialJobDetail);
return Promise.resolve(trialJobDetail); return Promise.resolve(trialJobDetail);
...@@ -131,8 +131,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -131,8 +131,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
/** /**
* upload code files to nfs or azureStroage * upload code files to nfs or azureStroage
* @param trialJobId * @param trialJobId
* @param trialLocalTempFolder * @param trialLocalTempFolder
* return: trialJobOutputUrl * return: trialJobOutputUrl
*/ */
private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise<string> { private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise<string> {
...@@ -145,7 +145,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -145,7 +145,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
if(this.fcClusterConfig.storageType === 'azureStorage') { if(this.fcClusterConfig.storageType === 'azureStorage') {
try{ try{
//upload local files to azure storage //upload local files to azure storage
await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
`nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`); `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`);
trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}` trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}`
...@@ -155,21 +155,21 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -155,21 +155,21 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
} }
} else if(this.fcClusterConfig.storageType === 'nfs') { } else if(this.fcClusterConfig.storageType === 'nfs') {
let nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS = <FrameworkControllerClusterConfigNFS>this.fcClusterConfig; let nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS = <FrameworkControllerClusterConfigNFS>this.fcClusterConfig;
// Creat work dir for current trial in NFS directory // Creat work dir for current trial in NFS directory
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`); await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`);
// Copy code files from local dir to NFS mounted dir // Copy code files from local dir to NFS mounted dir
await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`); await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
const nfsConfig: NFSConfig = nfsFrameworkControllerClusterConfig.nfs; const nfsConfig: NFSConfig = nfsFrameworkControllerClusterConfig.nfs;
trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}` trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`
} }
return Promise.resolve(trialJobOutputUrl); return Promise.resolve(trialJobOutputUrl);
} }
/** /**
* generate trial's command for frameworkcontroller * generate trial's command for frameworkcontroller
* expose port and execute injector.sh before executing user's command * expose port and execute injector.sh before executing user's command
* @param command * @param command
*/ */
private generateCommandScript(command: string): string { private generateCommandScript(command: string): string {
let portScript = ''; let portScript = '';
...@@ -181,7 +181,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -181,7 +181,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
} }
return `${portScript} . /mnt/frameworkbarrier/injector.sh && ${command}`; return `${portScript} . /mnt/frameworkbarrier/injector.sh && ${command}`;
} }
private async prepareRunScript(trialLocalTempFolder: string, curTrialSequenceId: number, trialJobId: string, trialWorkingFolder: string, form: JobApplicationForm): Promise<void> { private async prepareRunScript(trialLocalTempFolder: string, curTrialSequenceId: number, trialJobId: string, trialWorkingFolder: string, form: JobApplicationForm): Promise<void> {
if(!this.fcTrialConfig) { if(!this.fcTrialConfig) {
throw new Error('frameworkcontroller trial config is not initialized'); throw new Error('frameworkcontroller trial config is not initialized');
...@@ -196,7 +196,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -196,7 +196,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`); await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);
for(let taskRole of this.fcTrialConfig.taskRoles) { for(let taskRole of this.fcTrialConfig.taskRoles) {
const runScriptContent: string = await this.generateRunScript('frameworkcontroller', trialJobId, trialWorkingFolder, const runScriptContent: string = await this.generateRunScript('frameworkcontroller', trialJobId, trialWorkingFolder,
this.generateCommandScript(taskRole.command), curTrialSequenceId.toString(), taskRole.name, taskRole.gpuNum); this.generateCommandScript(taskRole.command), curTrialSequenceId.toString(), taskRole.name, taskRole.gpuNum);
await fs.promises.writeFile(path.join(trialLocalTempFolder, `run_${taskRole.name}.sh`), runScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, `run_${taskRole.name}.sh`), runScriptContent, { encoding: 'utf8' });
} }
...@@ -204,11 +204,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -204,11 +204,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
// Write file content ( parameter.cfg ) to local tmp folders // Write file content ( parameter.cfg ) to local tmp folders
const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form) const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form)
if(trialForm && trialForm.hyperParameters) { if(trialForm && trialForm.hyperParameters) {
await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)), await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)),
trialForm.hyperParameters.value, { encoding: 'utf8' }); trialForm.hyperParameters.value, { encoding: 'utf8' });
} }
} }
private async prepareFrameworkControllerConfig(trialJobId: string, trialWorkingFolder: string, frameworkcontrollerJobName: string): Promise<any> { private async prepareFrameworkControllerConfig(trialJobId: string, trialWorkingFolder: string, frameworkcontrollerJobName: string): Promise<any> {
if(!this.fcTrialConfig) { if(!this.fcTrialConfig) {
...@@ -222,18 +222,18 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -222,18 +222,18 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
resource.limits = Object.assign({}, resource.requests); resource.limits = Object.assign({}, resource.requests);
podResources.push(resource); podResources.push(resource);
} }
// Generate frameworkcontroller job resource config object // Generate frameworkcontroller job resource config object
const frameworkcontrollerJobConfig: any = this.generateFrameworkControllerJobConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName, podResources); const frameworkcontrollerJobConfig: any = this.generateFrameworkControllerJobConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName, podResources);
return Promise.resolve(frameworkcontrollerJobConfig); return Promise.resolve(frameworkcontrollerJobConfig);
} }
public async setClusterMetadata(key: string, value: string): Promise<void> { public async setClusterMetadata(key: string, value: string): Promise<void> {
switch (key) { switch (key) {
case TrialConfigMetadataKey.NNI_MANAGER_IP: case TrialConfigMetadataKey.NNI_MANAGER_IP:
this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value); this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
break; break;
case TrialConfigMetadataKey.FRAMEWORKCONTROLLER_CLUSTER_CONFIG: case TrialConfigMetadataKey.FRAMEWORKCONTROLLER_CLUSTER_CONFIG:
let frameworkcontrollerClusterJsonObject = JSON.parse(value); let frameworkcontrollerClusterJsonObject = JSON.parse(value);
this.fcClusterConfig = FrameworkControllerClusterConfigFactory.generateFrameworkControllerClusterConfig(frameworkcontrollerClusterJsonObject); this.fcClusterConfig = FrameworkControllerClusterConfigFactory.generateFrameworkControllerClusterConfig(frameworkcontrollerClusterJsonObject);
...@@ -253,7 +253,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -253,7 +253,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
nfsFrameworkControllerClusterConfig.nfs.server, nfsFrameworkControllerClusterConfig.nfs.server,
nfsFrameworkControllerClusterConfig.nfs.path nfsFrameworkControllerClusterConfig.nfs.path
); );
} }
this.kubernetesCRDClient = FrameworkControllerClient.generateFrameworkControllerClient(); this.kubernetesCRDClient = FrameworkControllerClient.generateFrameworkControllerClient();
break; break;
case TrialConfigMetadataKey.TRIAL_CONFIG: case TrialConfigMetadataKey.TRIAL_CONFIG:
...@@ -269,7 +269,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -269,7 +269,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
await validateCodeDir(this.fcTrialConfig.codeDir); await validateCodeDir(this.fcTrialConfig.codeDir);
} catch(error) { } catch(error) {
this.log.error(error); this.log.error(error);
return Promise.reject(new Error(error)); return Promise.reject(new Error(error));
} }
break; break;
case TrialConfigMetadataKey.VERSION_CHECK: case TrialConfigMetadataKey.VERSION_CHECK:
...@@ -284,7 +284,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -284,7 +284,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
return Promise.resolve(); return Promise.resolve();
} }
private generateContainerPort() { private generateContainerPort() {
if(!this.fcTrialConfig) { if(!this.fcTrialConfig) {
throw new Error('frameworkcontroller trial config is not initialized'); throw new Error('frameworkcontroller trial config is not initialized');
...@@ -312,7 +312,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -312,7 +312,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
if(!this.fcTrialConfig) { if(!this.fcTrialConfig) {
throw new Error('frameworkcontroller trial config is not initialized'); throw new Error('frameworkcontroller trial config is not initialized');
} }
let taskRoles = []; let taskRoles = [];
for(let index in this.fcTrialConfig.taskRoles) { for(let index in this.fcTrialConfig.taskRoles) {
let containerPort = this.fcContainerPortMap.get(this.fcTrialConfig.taskRoles[index].name); let containerPort = this.fcContainerPortMap.get(this.fcTrialConfig.taskRoles[index].name);
...@@ -320,8 +320,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -320,8 +320,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
throw new Error('Container port is not initialized'); throw new Error('Container port is not initialized');
} }
let taskRole = this.generateTaskRoleConfig( let taskRole = this.generateTaskRoleConfig(
trialWorkingFolder, trialWorkingFolder,
this.fcTrialConfig.taskRoles[index].image, this.fcTrialConfig.taskRoles[index].image,
`run_${this.fcTrialConfig.taskRoles[index].name}.sh`, `run_${this.fcTrialConfig.taskRoles[index].name}.sh`,
podResources[index], podResources[index],
containerPort containerPort
...@@ -330,17 +330,17 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -330,17 +330,17 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
name: this.fcTrialConfig.taskRoles[index].name, name: this.fcTrialConfig.taskRoles[index].name,
taskNumber: this.fcTrialConfig.taskRoles[index].taskNum, taskNumber: this.fcTrialConfig.taskRoles[index].taskNum,
frameworkAttemptCompletionPolicy: { frameworkAttemptCompletionPolicy: {
minFailedTaskCount: this.fcTrialConfig.taskRoles[index].frameworkAttemptCompletionPolicy.minFailedTaskCount, minFailedTaskCount: this.fcTrialConfig.taskRoles[index].frameworkAttemptCompletionPolicy.minFailedTaskCount,
minSucceededTaskCount: this.fcTrialConfig.taskRoles[index].frameworkAttemptCompletionPolicy.minSucceededTaskCount minSucceededTaskCount: this.fcTrialConfig.taskRoles[index].frameworkAttemptCompletionPolicy.minSucceededTaskCount
}, },
task: taskRole task: taskRole
}); });
} }
return { return {
apiVersion: `frameworkcontroller.microsoft.com/v1`, apiVersion: `frameworkcontroller.microsoft.com/v1`,
kind: 'Framework', kind: 'Framework',
metadata: { metadata: {
name: frameworkcontrollerJobName, name: frameworkcontrollerJobName,
namespace: 'default', namespace: 'default',
labels: { labels: {
...@@ -356,7 +356,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -356,7 +356,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}; };
} }
private generateTaskRoleConfig(trialWorkingFolder: string, replicaImage: string, runScriptFile: string, podResources: any, containerPort: number): any { private generateTaskRoleConfig(trialWorkingFolder: string, replicaImage: string, runScriptFile: string, podResources: any, containerPort: number): any {
if(!this.fcClusterConfig) { if(!this.fcClusterConfig) {
...@@ -366,7 +366,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -366,7 +366,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
if(!this.fcTrialConfig) { if(!this.fcTrialConfig) {
throw new Error('frameworkcontroller trial config is not initialized'); throw new Error('frameworkcontroller trial config is not initialized');
} }
let volumeSpecMap = new Map<string, object>(); let volumeSpecMap = new Map<string, object>();
if(this.fcClusterConfig.storageType === 'azureStorage'){ if(this.fcClusterConfig.storageType === 'azureStorage'){
volumeSpecMap.set('nniVolumes', [ volumeSpecMap.set('nniVolumes', [
...@@ -395,7 +395,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -395,7 +395,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
emptyDir: {} emptyDir: {}
}]) }])
} }
let containers = [ let containers = [
{ {
name: 'framework', name: 'framework',
...@@ -420,7 +420,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -420,7 +420,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
name: 'frameworkbarrier', name: 'frameworkbarrier',
image: 'frameworkcontroller/frameworkbarrier', image: 'frameworkcontroller/frameworkbarrier',
volumeMounts: [ volumeMounts: [
{ {
name: 'frameworkbarrier-volume', name: 'frameworkbarrier-volume',
mountPath: '/mnt/frameworkbarrier' mountPath: '/mnt/frameworkbarrier'
}] }]
...@@ -432,8 +432,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -432,8 +432,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
volumes: volumeSpecMap.get('nniVolumes'), volumes: volumeSpecMap.get('nniVolumes'),
hostNetwork: false hostNetwork: false
}; };
if(this.fcClusterConfig.serviceAccountName) { if(this.fcClusterConfig.serviceAccountName) {
spec.serviceAccountName = this.fcClusterConfig.serviceAccountName; spec.serviceAccountName = this.fcClusterConfig.serviceAccountName;
} }
let taskRole = { let taskRole = {
pod: { pod: {
......
...@@ -27,7 +27,7 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{ ...@@ -27,7 +27,7 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
/** /**
* Factory method to generate operator cliet * Factory method to generate operator cliet
*/ */
public static generateOperatorClient(kubeflowOperator: KubeflowOperator, public static generateOperatorClient(kubeflowOperator: KubeflowOperator,
operatorApiVersion: string): KubernetesCRDClient { operatorApiVersion: string): KubernetesCRDClient {
switch(kubeflowOperator) { switch(kubeflowOperator) {
case 'tf-operator': { case 'tf-operator': {
...@@ -78,7 +78,7 @@ class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient { ...@@ -78,7 +78,7 @@ class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
public get containerName(): string { public get containerName(): string {
return 'tensorflow'; return 'tensorflow';
} }
} }
class TFOperatorClientV1Beta1 extends KubernetesCRDClient { class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
...@@ -97,7 +97,7 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient { ...@@ -97,7 +97,7 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
public get containerName(): string { public get containerName(): string {
return 'tensorflow'; return 'tensorflow';
} }
} }
class TFOperatorClientV1Beta2 extends KubernetesCRDClient { class TFOperatorClientV1Beta2 extends KubernetesCRDClient {
......
...@@ -41,8 +41,8 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig { ...@@ -41,8 +41,8 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig {
export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS { export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
public readonly operator: KubeflowOperator; public readonly operator: KubeflowOperator;
constructor( constructor(
operator: KubeflowOperator, operator: KubeflowOperator,
apiVersion: string, apiVersion: string,
nfs: NFSConfig, nfs: NFSConfig,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind
) { ) {
...@@ -68,12 +68,12 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS { ...@@ -68,12 +68,12 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure{ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure{
public readonly operator: KubeflowOperator; public readonly operator: KubeflowOperator;
constructor( constructor(
operator: KubeflowOperator, operator: KubeflowOperator,
apiVersion: string, apiVersion: string,
keyVault: keyVaultConfig, keyVault: keyVaultConfig,
azureStorage: AzureStorage, azureStorage: AzureStorage,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind
) { ) {
super(apiVersion, keyVault, azureStorage,storage); super(apiVersion, keyVault, azureStorage,storage);
...@@ -124,7 +124,7 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig { ...@@ -124,7 +124,7 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate{ export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate{
public readonly replicas: number; public readonly replicas: number;
constructor(replicas: number, command : string, gpuNum : number, constructor(replicas: number, command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string) { cpuNum: number, memoryMB: number, image: string) {
super(command, gpuNum, cpuNum, memoryMB, image); super(command, gpuNum, cpuNum, memoryMB, image);
this.replicas = replicas; this.replicas = replicas;
......
...@@ -32,7 +32,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{ ...@@ -32,7 +32,7 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
super(jobMap); super(jobMap);
} }
protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined, protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> { kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> {
if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) { if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) {
return Promise.resolve(); return Promise.resolve();
...@@ -44,9 +44,9 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{ ...@@ -44,9 +44,9 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
let kubernetesJobInfo: any; let kubernetesJobInfo: any;
try { try {
kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName); kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName);
} catch(error) { } catch(error) {
// Notice: it maynot be a 'real' error since cancel trial job can also cause getKubernetesJob failed. // Notice: it maynot be a 'real' error since cancel trial job can also cause getKubernetesJob failed.
this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`); this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`);
//This is not treat as a error status //This is not treat as a error status
return Promise.resolve(); return Promise.resolve();
...@@ -58,8 +58,8 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{ ...@@ -58,8 +58,8 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
switch(tfJobType) { switch(tfJobType) {
case 'Created': case 'Created':
kubernetesTrialJob.status = 'WAITING'; kubernetesTrialJob.status = 'WAITING';
kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime); kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
break; break;
case 'Running': case 'Running':
kubernetesTrialJob.status = 'RUNNING'; kubernetesTrialJob.status = 'RUNNING';
if(!kubernetesTrialJob.startTime) { if(!kubernetesTrialJob.startTime) {
...@@ -68,11 +68,11 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{ ...@@ -68,11 +68,11 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
break; break;
case 'Failed': case 'Failed':
kubernetesTrialJob.status = 'FAILED'; kubernetesTrialJob.status = 'FAILED';
kubernetesTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime); kubernetesTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
break; break;
case 'Succeeded': case 'Succeeded':
kubernetesTrialJob.status = 'SUCCEEDED'; kubernetesTrialJob.status = 'SUCCEEDED';
kubernetesTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime); kubernetesTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
break; break;
default: default:
break; break;
......
...@@ -25,7 +25,7 @@ import { KubernetesJobRestServer } from '../kubernetesJobRestServer' ...@@ -25,7 +25,7 @@ import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/** /**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update * Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
* *
*/ */
@component.Singleton @component.Singleton
export class KubeflowJobRestServer extends KubernetesJobRestServer{ export class KubeflowJobRestServer extends KubernetesJobRestServer{
...@@ -34,5 +34,5 @@ export class KubeflowJobRestServer extends KubernetesJobRestServer{ ...@@ -34,5 +34,5 @@ export class KubeflowJobRestServer extends KubernetesJobRestServer{
*/ */
constructor() { constructor() {
super(component.get(KubeflowTrainingService)); super(component.get(KubeflowTrainingService));
} }
} }
\ No newline at end of file
...@@ -57,9 +57,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -57,9 +57,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
private kubeflowJobInfoCollector: KubeflowJobInfoCollector; private kubeflowJobInfoCollector: KubeflowJobInfoCollector;
constructor() { constructor() {
super(); super();
this.kubeflowJobInfoCollector = new KubeflowJobInfoCollector(this.trialJobsMap); this.kubeflowJobInfoCollector = new KubeflowJobInfoCollector(this.trialJobsMap);
this.experimentId = getExperimentId(); this.experimentId = getExperimentId();
this.nextTrialSequenceId = -1; this.nextTrialSequenceId = -1;
this.log.info('Construct Kubeflow training service.'); this.log.info('Construct Kubeflow training service.');
} }
...@@ -74,7 +74,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -74,7 +74,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
this.kubernetesJobRestServer.setEnableVersionCheck = this.versionCheck; this.kubernetesJobRestServer.setEnableVersionCheck = this.versionCheck;
this.log.info(`Kubeflow Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`); this.log.info(`Kubeflow Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`);
while (!this.stopping) { while (!this.stopping) {
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server // collect metrics for Kubeflow jobs by interacting with Kubernetes API server
await delay(3000); await delay(3000);
await this.kubeflowJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient); await this.kubeflowJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient);
if(this.kubernetesJobRestServer.getErrorMessage) { if(this.kubernetesJobRestServer.getErrorMessage) {
...@@ -113,22 +113,22 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -113,22 +113,22 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
curTrialSequenceId, curTrialSequenceId,
trialJobOutputUrl trialJobOutputUrl
); );
// Generate kubeflow job resource config object // Generate kubeflow job resource config object
const kubeflowJobConfig: any = await this.prepareKubeflowConfig(trialJobId, trialWorkingFolder, kubeflowJobName); const kubeflowJobConfig: any = await this.prepareKubeflowConfig(trialJobId, trialWorkingFolder, kubeflowJobName);
// Create kubeflow job based on generated kubeflow job resource config // Create kubeflow job based on generated kubeflow job resource config
await this.kubernetesCRDClient.createKubernetesJob(kubeflowJobConfig); await this.kubernetesCRDClient.createKubernetesJob(kubeflowJobConfig);
// Set trial job detail until create Kubeflow job successfully // Set trial job detail until create Kubeflow job successfully
this.trialJobsMap.set(trialJobId, trialJobDetail); this.trialJobsMap.set(trialJobId, trialJobDetail);
return Promise.resolve(trialJobDetail); return Promise.resolve(trialJobDetail);
} }
/** /**
* upload code files to nfs or azureStroage * upload code files to nfs or azureStroage
* @param trialJobId * @param trialJobId
* @param trialLocalTempFolder * @param trialLocalTempFolder
* return: trialJobOutputUrl * return: trialJobOutputUrl
*/ */
private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise<string> { private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise<string> {
...@@ -138,14 +138,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -138,14 +138,14 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
let trialJobOutputUrl: string = ''; let trialJobOutputUrl: string = '';
assert(!this.kubeflowClusterConfig.storage assert(!this.kubeflowClusterConfig.storage
|| this.kubeflowClusterConfig.storage === 'azureStorage' || this.kubeflowClusterConfig.storage === 'azureStorage'
|| this.kubeflowClusterConfig.storage === 'nfs'); || this.kubeflowClusterConfig.storage === 'nfs');
if(this.kubeflowClusterConfig.storage === 'azureStorage') { if(this.kubeflowClusterConfig.storage === 'azureStorage') {
try{ try{
//upload local files to azure storage //upload local files to azure storage
await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
`nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`); `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`);
trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}` trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}`
...@@ -155,18 +155,18 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -155,18 +155,18 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} }
} else if(this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) { } else if(this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) {
let nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig; let nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
// Creat work dir for current trial in NFS directory // Creat work dir for current trial in NFS directory
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`); await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`);
// Copy code files from local dir to NFS mounted dir // Copy code files from local dir to NFS mounted dir
await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`); await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
const nfsConfig: NFSConfig = nfsKubeflowClusterConfig.nfs; const nfsConfig: NFSConfig = nfsKubeflowClusterConfig.nfs;
trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}` trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`
} }
return Promise.resolve(trialJobOutputUrl); return Promise.resolve(trialJobOutputUrl);
} }
private async prepareRunScript(trialLocalTempFolder: string, trialJobId: string, trialWorkingFolder: string, curTrialSequenceId: number, form: JobApplicationForm): Promise<void> { private async prepareRunScript(trialLocalTempFolder: string, trialJobId: string, trialWorkingFolder: string, curTrialSequenceId: number, form: JobApplicationForm): Promise<void> {
if(!this.kubeflowClusterConfig) { if(!this.kubeflowClusterConfig) {
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
...@@ -181,7 +181,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -181,7 +181,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}else { }else {
throw Error(`operator ${this.kubeflowClusterConfig.operator} is invalid`) throw Error(`operator ${this.kubeflowClusterConfig.operator} is invalid`)
} }
//create tmp trial working folder locally. //create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`); await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${kubeflowTrialConfig.codeDir} ${trialLocalTempFolder}`); await cpp.exec(`cp -r ${kubeflowTrialConfig.codeDir} ${trialLocalTempFolder}`);
...@@ -193,7 +193,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -193,7 +193,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// Write worker file content run_worker.sh to local tmp folders // Write worker file content run_worker.sh to local tmp folders
if(kubeflowTrialConfig.worker) { if(kubeflowTrialConfig.worker) {
const workerRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder, const workerRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder,
kubeflowTrialConfig.worker.command, curTrialSequenceId.toString(), 'worker', kubeflowTrialConfig.worker.gpuNum); kubeflowTrialConfig.worker.command, curTrialSequenceId.toString(), 'worker', kubeflowTrialConfig.worker.gpuNum);
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_worker.sh'), workerRunScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_worker.sh'), workerRunScriptContent, { encoding: 'utf8' });
...@@ -202,7 +202,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -202,7 +202,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if(this.kubeflowClusterConfig.operator === 'tf-operator') { if(this.kubeflowClusterConfig.operator === 'tf-operator') {
let tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig; let tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
if(tensorflowTrialConfig.ps){ if(tensorflowTrialConfig.ps){
const psRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder, const psRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder,
tensorflowTrialConfig.ps.command, curTrialSequenceId.toString(), 'ps', tensorflowTrialConfig.ps.gpuNum); tensorflowTrialConfig.ps.command, curTrialSequenceId.toString(), 'ps', tensorflowTrialConfig.ps.gpuNum);
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_ps.sh'), psRunScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_ps.sh'), psRunScriptContent, { encoding: 'utf8' });
} }
...@@ -210,7 +210,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -210,7 +210,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
else if(this.kubeflowClusterConfig.operator === 'pytorch-operator') { else if(this.kubeflowClusterConfig.operator === 'pytorch-operator') {
let pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig; let pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;
if(pytorchTrialConfig.master){ if(pytorchTrialConfig.master){
const masterRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder, const masterRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder,
pytorchTrialConfig.master.command, curTrialSequenceId.toString(), 'master', pytorchTrialConfig.master.gpuNum); pytorchTrialConfig.master.command, curTrialSequenceId.toString(), 'master', pytorchTrialConfig.master.gpuNum);
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_master.sh'), masterRunScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_master.sh'), masterRunScriptContent, { encoding: 'utf8' });
} }
...@@ -218,11 +218,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -218,11 +218,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// Write file content ( parameter.cfg ) to local tmp folders // Write file content ( parameter.cfg ) to local tmp folders
const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form) const trialForm : TrialJobApplicationForm = (<TrialJobApplicationForm>form)
if(trialForm && trialForm.hyperParameters) { if(trialForm && trialForm.hyperParameters) {
await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)), await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)),
trialForm.hyperParameters.value, { encoding: 'utf8' }); trialForm.hyperParameters.value, { encoding: 'utf8' });
} }
} }
private async prepareKubeflowConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName: string): Promise<any> { private async prepareKubeflowConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName: string): Promise<any> {
if(!this.kubeflowClusterConfig) { if(!this.kubeflowClusterConfig) {
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
...@@ -241,10 +241,10 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -241,10 +241,10 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
}else { }else {
throw Error(`operator ${this.kubeflowClusterConfig.operator} is invalid`) throw Error(`operator ${this.kubeflowClusterConfig.operator} is invalid`)
} }
const workerPodResources : any = {}; const workerPodResources : any = {};
if(kubeflowTrialConfig.worker) { if(kubeflowTrialConfig.worker) {
workerPodResources.requests = this.generatePodResource(kubeflowTrialConfig.worker.memoryMB, kubeflowTrialConfig.worker.cpuNum, workerPodResources.requests = this.generatePodResource(kubeflowTrialConfig.worker.memoryMB, kubeflowTrialConfig.worker.cpuNum,
kubeflowTrialConfig.worker.gpuNum) kubeflowTrialConfig.worker.gpuNum)
} }
workerPodResources.limits = Object.assign({}, workerPodResources.requests); workerPodResources.limits = Object.assign({}, workerPodResources.requests);
...@@ -253,30 +253,30 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -253,30 +253,30 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if(this.kubeflowClusterConfig.operator === 'tf-operator') { if(this.kubeflowClusterConfig.operator === 'tf-operator') {
let tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig; let tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
if (tensorflowTrialConfig.ps) { if (tensorflowTrialConfig.ps) {
nonWorkerResources.requests = this.generatePodResource(tensorflowTrialConfig.ps.memoryMB, tensorflowTrialConfig.ps.cpuNum, nonWorkerResources.requests = this.generatePodResource(tensorflowTrialConfig.ps.memoryMB, tensorflowTrialConfig.ps.cpuNum,
tensorflowTrialConfig.ps.gpuNum) tensorflowTrialConfig.ps.gpuNum)
nonWorkerResources.limits = Object.assign({}, nonWorkerResources.requests); nonWorkerResources.limits = Object.assign({}, nonWorkerResources.requests);
} }
}else if(this.kubeflowClusterConfig.operator === 'pytorch-operator'){ }else if(this.kubeflowClusterConfig.operator === 'pytorch-operator'){
let pyTorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig; let pyTorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;
nonWorkerResources.requests = this.generatePodResource(pyTorchTrialConfig.master.memoryMB, pyTorchTrialConfig.master.cpuNum, nonWorkerResources.requests = this.generatePodResource(pyTorchTrialConfig.master.memoryMB, pyTorchTrialConfig.master.cpuNum,
pyTorchTrialConfig.master.gpuNum) pyTorchTrialConfig.master.gpuNum)
nonWorkerResources.limits = Object.assign({}, nonWorkerResources.requests); nonWorkerResources.limits = Object.assign({}, nonWorkerResources.requests);
}
// Generate kubeflow job resource config object }
// Generate kubeflow job resource config object
const kubeflowJobConfig: any = this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources, nonWorkerResources); const kubeflowJobConfig: any = this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources, nonWorkerResources);
return Promise.resolve(kubeflowJobConfig); return Promise.resolve(kubeflowJobConfig);
} }
public async setClusterMetadata(key: string, value: string): Promise<void> { public async setClusterMetadata(key: string, value: string): Promise<void> {
switch (key) { switch (key) {
case TrialConfigMetadataKey.NNI_MANAGER_IP: case TrialConfigMetadataKey.NNI_MANAGER_IP:
this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value); this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
break; break;
case TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG: case TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG:
let kubeflowClusterJsonObject = JSON.parse(value); let kubeflowClusterJsonObject = JSON.parse(value);
this.kubeflowClusterConfig = KubeflowClusterConfigFactory.generateKubeflowClusterConfig(kubeflowClusterJsonObject); this.kubeflowClusterConfig = KubeflowClusterConfigFactory.generateKubeflowClusterConfig(kubeflowClusterJsonObject);
...@@ -296,7 +296,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -296,7 +296,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
nfsKubeflowClusterConfig.nfs.server, nfsKubeflowClusterConfig.nfs.server,
nfsKubeflowClusterConfig.nfs.path nfsKubeflowClusterConfig.nfs.path
); );
} }
this.kubernetesCRDClient = KubeflowOperatorClient.generateOperatorClient(this.kubeflowClusterConfig.operator, this.kubernetesCRDClient = KubeflowOperatorClient.generateOperatorClient(this.kubeflowClusterConfig.operator,
this.kubeflowClusterConfig.apiVersion); this.kubeflowClusterConfig.apiVersion);
break; break;
...@@ -304,13 +304,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -304,13 +304,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
case TrialConfigMetadataKey.TRIAL_CONFIG: case TrialConfigMetadataKey.TRIAL_CONFIG:
if (!this.kubeflowClusterConfig){ if (!this.kubeflowClusterConfig){
this.log.error('kubeflow cluster config is not initialized'); this.log.error('kubeflow cluster config is not initialized');
return Promise.reject(new Error('kubeflow cluster config is not initialized')); return Promise.reject(new Error('kubeflow cluster config is not initialized'));
} }
assert(this.kubeflowClusterConfig !== undefined) assert(this.kubeflowClusterConfig !== undefined)
let kubeflowTrialJsonObjsect = JSON.parse(value); let kubeflowTrialJsonObjsect = JSON.parse(value);
this.kubeflowTrialConfig = KubeflowTrialConfigFactory.generateKubeflowTrialConfig( this.kubeflowTrialConfig = KubeflowTrialConfigFactory.generateKubeflowTrialConfig(
kubeflowTrialJsonObjsect, kubeflowTrialJsonObjsect,
this.kubeflowClusterConfig.operator this.kubeflowClusterConfig.operator
); );
...@@ -319,7 +319,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -319,7 +319,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
await validateCodeDir(this.kubeflowTrialConfig.codeDir); await validateCodeDir(this.kubeflowTrialConfig.codeDir);
} catch(error) { } catch(error) {
this.log.error(error); this.log.error(error);
return Promise.reject(new Error(error)); return Promise.reject(new Error(error));
} }
break; break;
case TrialConfigMetadataKey.VERSION_CHECK: case TrialConfigMetadataKey.VERSION_CHECK:
...@@ -361,11 +361,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -361,11 +361,11 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if(this.kubeflowTrialConfig.operatorType === 'tf-operator') { if(this.kubeflowTrialConfig.operatorType === 'tf-operator') {
let tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig; let tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas, replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas,
tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources); tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources);
if (tensorflowTrialConfig.ps){ if (tensorflowTrialConfig.ps){
replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas, replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas,
tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources); tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources);
} }
replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {'tfReplicaSpecs': replicaSpecsObj}) replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {'tfReplicaSpecs': replicaSpecsObj})
...@@ -373,19 +373,19 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -373,19 +373,19 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
else if(this.kubeflowTrialConfig.operatorType === 'pytorch-operator') { else if(this.kubeflowTrialConfig.operatorType === 'pytorch-operator') {
let pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig; let pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;
if(pytorchTrialConfig.worker) { if(pytorchTrialConfig.worker) {
replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas, replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas,
pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources); pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources);
} }
replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas, replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas,
pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources); pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources);
replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {'pytorchReplicaSpecs': replicaSpecsObj}) replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {'pytorchReplicaSpecs': replicaSpecsObj})
} }
return { return {
apiVersion: `kubeflow.org/${this.kubernetesCRDClient.apiVersion}`, apiVersion: `kubeflow.org/${this.kubernetesCRDClient.apiVersion}`,
kind: this.kubernetesCRDClient.jobKind, kind: this.kubernetesCRDClient.jobKind,
metadata: { metadata: {
name: kubeflowJobName, name: kubeflowJobName,
namespace: 'default', namespace: 'default',
labels: { labels: {
...@@ -395,7 +395,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -395,7 +395,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} }
}, },
spec: replicaSpecsObjMap.get(this.kubernetesCRDClient.jobKind) spec: replicaSpecsObjMap.get(this.kubernetesCRDClient.jobKind)
}; };
} }
/** /**
......
...@@ -39,7 +39,7 @@ class GeneralK8sClient { ...@@ -39,7 +39,7 @@ class GeneralK8sClient {
} }
public async createSecret(secretManifest: any): Promise<boolean> { public async createSecret(secretManifest: any): Promise<boolean> {
let result: Promise<boolean>; let result: Promise<boolean>;
const response : any = await this.client.api.v1.namespaces('default').secrets.post({body: secretManifest}); const response : any = await this.client.api.v1.namespaces('default').secrets.post({body: secretManifest});
if(response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) { if(response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) {
result = Promise.resolve(true); result = Promise.resolve(true);
...@@ -65,8 +65,8 @@ abstract class KubernetesCRDClient { ...@@ -65,8 +65,8 @@ abstract class KubernetesCRDClient {
public abstract get containerName(): string; public abstract get containerName(): string;
public get jobKind(): string { public get jobKind(): string {
if(this.crdSchema if(this.crdSchema
&& this.crdSchema.spec && this.crdSchema.spec
&& this.crdSchema.spec.names && this.crdSchema.spec.names
&& this.crdSchema.spec.names.kind) { && this.crdSchema.spec.names.kind) {
return this.crdSchema.spec.names.kind; return this.crdSchema.spec.names.kind;
...@@ -76,15 +76,15 @@ abstract class KubernetesCRDClient { ...@@ -76,15 +76,15 @@ abstract class KubernetesCRDClient {
} }
public get apiVersion(): string { public get apiVersion(): string {
if(this.crdSchema if(this.crdSchema
&& this.crdSchema.spec && this.crdSchema.spec
&& this.crdSchema.spec.version) { && this.crdSchema.spec.version) {
return this.crdSchema.spec.version; return this.crdSchema.spec.version;
} else { } else {
throw new Error('KubeflowOperatorClient: get apiVersion failed, version is undefined in crd schema!'); throw new Error('KubeflowOperatorClient: get apiVersion failed, version is undefined in crd schema!');
} }
} }
public async createKubernetesJob(jobManifest: any): Promise<boolean> { public async createKubernetesJob(jobManifest: any): Promise<boolean> {
let result: Promise<boolean>; let result: Promise<boolean>;
const response : any = await this.operator.post({body: jobManifest}); const response : any = await this.operator.post({body: jobManifest});
...@@ -117,7 +117,7 @@ abstract class KubernetesCRDClient { ...@@ -117,7 +117,7 @@ abstract class KubernetesCRDClient {
qs: { qs: {
labelSelector: matchQuery, labelSelector: matchQuery,
propagationPolicy: "Background" propagationPolicy: "Background"
} }
}); });
if(deleteResult.statusCode && deleteResult.statusCode >= 200 && deleteResult.statusCode <= 299) { if(deleteResult.statusCode && deleteResult.statusCode >= 200 && deleteResult.statusCode <= 299) {
result = Promise.resolve(true); result = Promise.resolve(true);
......
...@@ -25,7 +25,7 @@ import { MethodNotImplementedError } from '../../common/errors'; ...@@ -25,7 +25,7 @@ import { MethodNotImplementedError } from '../../common/errors';
export abstract class KubernetesClusterConfig { export abstract class KubernetesClusterConfig {
public readonly storage?: KubernetesStorageKind; public readonly storage?: KubernetesStorageKind;
public readonly apiVersion: string; public readonly apiVersion: string;
constructor(apiVersion: string, storage?: KubernetesStorageKind) { constructor(apiVersion: string, storage?: KubernetesStorageKind) {
this.storage = storage; this.storage = storage;
this.apiVersion = apiVersion; this.apiVersion = apiVersion;
...@@ -48,7 +48,7 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig { ...@@ -48,7 +48,7 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
public readonly nfs: NFSConfig; public readonly nfs: NFSConfig;
constructor( constructor(
apiVersion: string, apiVersion: string,
nfs: NFSConfig, nfs: NFSConfig,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind
) { ) {
...@@ -73,11 +73,11 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig { ...@@ -73,11 +73,11 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
export class KubernetesClusterConfigAzure extends KubernetesClusterConfig { export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
public readonly keyVault: keyVaultConfig; public readonly keyVault: keyVaultConfig;
public readonly azureStorage: AzureStorage; public readonly azureStorage: AzureStorage;
constructor( constructor(
apiVersion: string, apiVersion: string,
keyVault: keyVaultConfig, keyVault: keyVaultConfig,
azureStorage: AzureStorage, azureStorage: AzureStorage,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind
) { ) {
super(apiVersion, storage); super(apiVersion, storage);
...@@ -151,7 +151,7 @@ export class keyVaultConfig { ...@@ -151,7 +151,7 @@ export class keyVaultConfig {
export class AzureStorage { export class AzureStorage {
/**The azure share to storage files */ /**The azure share to storage files */
public readonly azureShare : string; public readonly azureShare : string;
/**The account name of sotrage service */ /**The account name of sotrage service */
public readonly accountName: string; public readonly accountName: string;
constructor(azureShare : string, accountName: string){ constructor(azureShare : string, accountName: string){
...@@ -178,8 +178,8 @@ export class KubernetesTrialConfigTemplate { ...@@ -178,8 +178,8 @@ export class KubernetesTrialConfigTemplate {
/** Required GPU number for trial job. The number should be in [0,100] */ /** Required GPU number for trial job. The number should be in [0,100] */
public readonly gpuNum : number; public readonly gpuNum : number;
constructor(command : string, gpuNum : number, constructor(command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string) { cpuNum: number, memoryMB: number, image: string) {
this.command = command; this.command = command;
this.gpuNum = gpuNum; this.gpuNum = gpuNum;
......
...@@ -40,7 +40,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail { ...@@ -40,7 +40,7 @@ export class KubernetesTrialJobDetail implements TrialJobDetail {
public queryJobFailedCount: number; public queryJobFailedCount: number;
constructor(id: string, status: TrialJobStatus, submitTime: number, constructor(id: string, status: TrialJobStatus, submitTime: number,
workingDirectory: string, form: JobApplicationForm, workingDirectory: string, form: JobApplicationForm,
kubernetesJobName: string, sequenceId: number, url: string) { kubernetesJobName: string, sequenceId: number, url: string) {
this.id = id; this.id = id;
this.status = status; this.status = status;
......
...@@ -57,7 +57,7 @@ export class KubernetesJobInfoCollector { ...@@ -57,7 +57,7 @@ export class KubernetesJobInfoCollector {
await Promise.all(updateKubernetesTrialJobs); await Promise.all(updateKubernetesTrialJobs);
} }
protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined, protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> { kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> {
throw new MethodNotImplementedError(); throw new MethodNotImplementedError();
} }
......
...@@ -26,7 +26,7 @@ import { ClusterJobRestServer } from '../common/clusterJobRestServer' ...@@ -26,7 +26,7 @@ import { ClusterJobRestServer } from '../common/clusterJobRestServer'
/** /**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update * Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
* *
*/ */
@component.Singleton @component.Singleton
export class KubernetesJobRestServer extends ClusterJobRestServer{ export class KubernetesJobRestServer extends ClusterJobRestServer{
...@@ -53,5 +53,5 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{ ...@@ -53,5 +53,5 @@ export class KubernetesJobRestServer extends ClusterJobRestServer{
data : singleMetric data : singleMetric
}); });
} }
} }
} }
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment