Unverified Commit 36dbc0fe authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

support frameworkcontroller training service (#484)

Add frameworkcontroller training service based on kubeflow training service.
Refactor the code structure: add the kubernetes training service as the parent class, and make the kubeflow training service and the frameworkcontroller training service its child classes.
parent 416b8b53
{
"kind": "CustomResourceDefinition",
"spec": {
"scope": "Namespaced",
"version": "v1",
"group": "frameworkcontroller.microsoft.com",
"names": {
"kind": "Framework",
"plural": "frameworks",
"singular": "framework"
}
},
"apiVersion": "apiextensions.k8s.io/v1beta1",
"metadata": {
"name": "frameworks.frameworkcontroller.microsoft.com"
}
}
...@@ -37,7 +37,8 @@ import { ...@@ -37,7 +37,8 @@ import {
RemoteMachineTrainingService RemoteMachineTrainingService
} from './training_service/remote_machine/remoteMachineTrainingService'; } from './training_service/remote_machine/remoteMachineTrainingService';
import { PAITrainingService } from './training_service/pai/paiTrainingService'; import { PAITrainingService } from './training_service/pai/paiTrainingService';
import { KubeflowTrainingService } from './training_service/kubeflow/kubeflowTrainingService'; import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService';
import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService';
function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number) { function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number) {
const createNew: boolean = (startExpMode === 'new'); const createNew: boolean = (startExpMode === 'new');
...@@ -54,7 +55,10 @@ async function initContainer(platformMode: string): Promise<void> { ...@@ -54,7 +55,10 @@ async function initContainer(platformMode: string): Promise<void> {
Container.bind(TrainingService).to(PAITrainingService).scope(Scope.Singleton); Container.bind(TrainingService).to(PAITrainingService).scope(Scope.Singleton);
} else if (platformMode === 'kubeflow') { } else if (platformMode === 'kubeflow') {
Container.bind(TrainingService).to(KubeflowTrainingService).scope(Scope.Singleton); Container.bind(TrainingService).to(KubeflowTrainingService).scope(Scope.Singleton);
} else { } else if (platformMode === 'frameworkcontroller') {
Container.bind(TrainingService).to(FrameworkControllerTrainingService).scope(Scope.Singleton);
}
else {
throw new Error(`Error: unsupported mode: ${mode}`); throw new Error(`Error: unsupported mode: ${mode}`);
} }
Container.bind(Manager).to(NNIManager).scope(Scope.Singleton); Container.bind(Manager).to(NNIManager).scope(Scope.Singleton);
...@@ -66,7 +70,7 @@ async function initContainer(platformMode: string): Promise<void> { ...@@ -66,7 +70,7 @@ async function initContainer(platformMode: string): Promise<void> {
} }
function usage(): void { function usage(): void {
console.info('usage: node main.js --port <port> --mode <local/remote/pai> --start_mode <new/resume> --experiment_id <id>'); console.info('usage: node main.js --port <port> --mode <local/remote/pai/kubeflow/frameworkcontroller> --start_mode <new/resume> --experiment_id <id>');
} }
const strPort: string = parseArg(['--port', '-p']); const strPort: string = parseArg(['--port', '-p']);
...@@ -78,7 +82,7 @@ if (!strPort || strPort.length === 0) { ...@@ -78,7 +82,7 @@ if (!strPort || strPort.length === 0) {
const port: number = parseInt(strPort, 10); const port: number = parseInt(strPort, 10);
const mode: string = parseArg(['--mode', '-m']); const mode: string = parseArg(['--mode', '-m']);
if (!['local', 'remote', 'pai', 'kubeflow'].includes(mode)) { if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'].includes(mode)) {
console.log(`FATAL: unknown mode: ${mode}`); console.log(`FATAL: unknown mode: ${mode}`);
usage(); usage();
process.exit(1); process.exit(1);
......
...@@ -68,6 +68,20 @@ export namespace ValidationSchemas { ...@@ -68,6 +68,20 @@ export namespace ValidationSchemas {
memoryMB: joi.number().min(100), memoryMB: joi.number().min(100),
gpuNum: joi.number().min(0).required(), gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required() command: joi.string().min(1).required()
}),
taskRoles: joi.array({
name: joi.string().min(1),
taskNum: joi.number().min(1).required(),
image: joi.string().min(1),
outputDir: joi.string(),
cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required(),
frameworkAttemptCompletionPolicy: joi.object({
minFailedTaskCount: joi.number(),
minSucceededTaskCount: joi.number()
})
}) })
}), }),
pai_config: joi.object({ pai_config: joi.object({
...@@ -92,6 +106,21 @@ export namespace ValidationSchemas { ...@@ -92,6 +106,21 @@ export namespace ValidationSchemas {
azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/)
}) })
}), }),
frameworkcontroller_config: joi.object({
storage: joi.string().min(1),
nfs: joi.object({
server: joi.string().min(1).required(),
path: joi.string().min(1).required()
}),
keyVault: joi.object({
vaultName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/),
name: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/)
}),
azureStorage: joi.object({
accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/),
azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/)
})
}),
nni_manager_ip: joi.object({ nni_manager_ip: joi.object({
nniManagerIp: joi.string().min(1) nniManagerIp: joi.string().min(1)
}) })
......
...@@ -30,5 +30,6 @@ export enum TrialConfigMetadataKey { ...@@ -30,5 +30,6 @@ export enum TrialConfigMetadataKey {
RANDOM_SCHEDULER = 'random_scheduler', RANDOM_SCHEDULER = 'random_scheduler',
PAI_CLUSTER_CONFIG = 'pai_config', PAI_CLUSTER_CONFIG = 'pai_config',
KUBEFLOW_CLUSTER_CONFIG = 'kubeflow_config', KUBEFLOW_CLUSTER_CONFIG = 'kubeflow_config',
NNI_MANAGER_IP = 'nni_manager_ip' NNI_MANAGER_IP = 'nni_manager_ip',
FRAMEWORKCONTROLLER_CLUSTER_CONFIG = 'frameworkcontroller_config'
} }
...@@ -195,6 +195,3 @@ export namespace AzureStorageClientUtility { ...@@ -195,6 +195,3 @@ export namespace AzureStorageClientUtility {
return deferred.promise; return deferred.promise;
} }
} }
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as fs from 'fs';
import { KubernetesCRDClient, GeneralK8sClient } from '../kubernetesApiClient';
/**
 * Abstract base for FrameworkController CRD clients.
 * Only adds a static factory on top of KubernetesCRDClient.
 */
abstract class FrameworkControllerClient extends KubernetesCRDClient {
    /**
     * Factory method to generate the FrameworkController CRD client.
     * @returns a newly constructed FrameworkControllerClientV1
     */
    public static generateFrameworkControllerClient(): KubernetesCRDClient {
        const v1Client: KubernetesCRDClient = new FrameworkControllerClientV1();

        return v1Client;
    }
}
/**
 * CRD client for the v1 API of the frameworkcontroller.microsoft.com group.
 */
class FrameworkControllerClientV1 extends FrameworkControllerClient {
    /**
     * Loads the frameworkcontroller CRD definition from disk and registers it on the client.
     */
    public constructor() {
        super();
        const crdDefinitionText: string = fs.readFileSync('./config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json', 'utf8');
        this.crdSchema = JSON.parse(crdDefinitionText);
        this.client.addCustomResourceDefinition(this.crdSchema);
    }

    /**
     * Accessor for the `frameworks` resource of the 'default' namespace.
     */
    protected get operator(): any {
        const groupApi: any = this.client.apis['frameworkcontroller.microsoft.com'];

        return groupApi.v1.namespaces('default').frameworks;
    }

    /**
     * Container name associated with this client.
     */
    public get containerName(): string {
        return 'framework';
    }
}
export { FrameworkControllerClient, GeneralK8sClient };
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { KubernetesTrialConfig, KubernetesTrialConfigTemplate } from '../kubernetesConfig'
/**
 * Holds the two task-count thresholds of a framework attempt completion policy.
 */
export class FrameworkAttemptCompletionPolicy {
    /**
     * @param minFailedTaskCount value stored as `minFailedTaskCount`
     * @param minSucceededTaskCount value stored as `minSucceededTaskCount`
     */
    constructor(
        public readonly minFailedTaskCount: number,
        public readonly minSucceededTaskCount: number
    ) {}
}
/**
 * Trial job configuration for a single FrameworkController task role.
 */
export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfigTemplate {
    public readonly frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy;
    public readonly name: string;
    public readonly taskNum: number;

    /**
     * @param taskNum value stored as `taskNum`
     * @param command forwarded to the base template
     * @param gpuNum forwarded to the base template
     * @param cpuNum forwarded to the base template
     * @param memoryMB forwarded to the base template
     * @param image forwarded to the base template
     * @param frameworkAttemptCompletionPolicy completion policy stored on this task role
     * @param name task role name; optional trailing parameter so existing seven-argument callers keep working.
     *             NOTE(review): the empty-string default is a placeholder - confirm whether a name should be mandatory.
     */
    constructor(taskNum: number, command: string, gpuNum: number,
                cpuNum: number, memoryMB: number, image: string,
                frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy,
                name: string = '') {
        super(command, gpuNum, cpuNum, memoryMB, image);
        this.frameworkAttemptCompletionPolicy = frameworkAttemptCompletionPolicy;
        // Previously `name` was not a parameter, so this read an undeclared identifier
        // and threw a ReferenceError under Node as soon as the constructor ran.
        this.name = name;
        this.taskNum = taskNum;
    }
}
/**
 * Trial configuration for FrameworkController: a code directory plus the list of task roles.
 */
export class FrameworkControllerTrialConfig extends KubernetesTrialConfig {
    public readonly taskRoles: FrameworkControllerTrialConfigTemplate[];
    public readonly codeDir: string;

    /**
     * @param codeDir code directory; passed to the base class and also stored on this instance
     * @param taskRoles per-role configuration entries
     */
    constructor(codeDir: string, taskRoles: FrameworkControllerTrialConfigTemplate[]) {
        super(codeDir);
        // Assignment order is kept as-is so the resulting object layout does not change.
        this.taskRoles = taskRoles;
        this.codeDir = codeDir;
    }
}
// State names a Framework job can report; the job info collector casts `status.state` of the job object to this type.
export type FrameworkControllerJobStatus = 'AttemptRunning' | 'Completed' | 'AttemptCreationPending' | 'AttemptCreationRequested' | 'AttemptPreparing' | 'AttemptCompleted';
// Completion type names of a 'Completed' job; the collector reads them from `status.attemptStatus.completionStatus.type.name`.
export type FrameworkControllerJobCompleteStatus = 'Succeeded' | 'Failed';
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesCRDClient } from '../kubernetesApiClient';
import { KubernetesJobInfoCollector } from '../kubernetesJobInfoCollector';
import { FrameworkControllerJobStatus, FrameworkControllerJobCompleteStatus } from './frameworkcontrollerConfig';
/**
 * Collects frameworkcontroller job info from the Kubernetes cluster and updates the
 * locally tracked trial job status accordingly.
 */
export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollector {
    constructor(jobMap: Map<string, KubernetesTrialJobDetail>) {
        super(jobMap);
    }

    /**
     * Queries one job through the CRD client and maps its reported state onto the trial job.
     *
     * @param kubernetesCRDClient client used to fetch the job; the returned promise rejects when it is undefined
     * @param kubernetesTrialJob trial job whose `status`, `startTime` and `endTime` may be updated in place
     * @returns a promise that resolves once the trial job has been updated (or left untouched)
     */
    protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
                                               kubernetesTrialJob: KubernetesTrialJobDetail): Promise<void> {
        // Jobs whose status is not in the watch list are skipped.
        if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) {
            return Promise.resolve();
        }

        if (kubernetesCRDClient === undefined) {
            return Promise.reject('kubernetesCRDClient is undefined');
        }

        let kubernetesJobInfo: any;
        try {
            kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName);
        } catch (error) {
            this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`);

            // A failed query is not treated as an error status; the job is left unchanged.
            return Promise.resolve();
        }

        if (kubernetesJobInfo.status && kubernetesJobInfo.status.state) {
            const frameworkJobType: FrameworkControllerJobStatus = <FrameworkControllerJobStatus>kubernetesJobInfo.status.state;
            switch (frameworkJobType) {
                // One label per state: `case 'A' || 'B' || 'C'` evaluates to `case 'A'` only,
                // which silently ignored the other two pre-running states.
                case 'AttemptCreationPending':
                case 'AttemptCreationRequested':
                case 'AttemptPreparing':
                    kubernetesTrialJob.status = 'WAITING';
                    break;
                case 'AttemptRunning':
                    kubernetesTrialJob.status = 'RUNNING';
                    // Only record the start time the first time the job is seen running.
                    if (!kubernetesTrialJob.startTime) {
                        kubernetesTrialJob.startTime = Date.parse(<string>kubernetesJobInfo.status.startTime);
                    }
                    break;
                case 'Completed': {
                    const completedJobType: FrameworkControllerJobCompleteStatus =
                        <FrameworkControllerJobCompleteStatus>kubernetesJobInfo.status.attemptStatus.completionStatus.type.name;
                    switch (completedJobType) {
                        case 'Succeeded':
                            kubernetesTrialJob.status = 'SUCCEEDED';
                            break;
                        case 'Failed':
                            kubernetesTrialJob.status = 'FAILED';
                            break;
                        default:
                            // Any other completion type leaves the status as it was.
                            break;
                    }
                    kubernetesTrialJob.endTime = Date.parse(<string>kubernetesJobInfo.status.completionTime);
                    break;
                }
                default:
                    break;
            }
        }

        return Promise.resolve();
    }
}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as component from '../../../common/component';
import { FrameworkControllerTrainingService } from './frameworkcontrollerTrainingService';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/**
 * REST server of the frameworkcontroller training service.
 * Provides the REST API that supports frameworkcontroller job metrics updates.
 */
@component.Singleton
export class FrameworkControllerJobRestServer extends KubernetesJobRestServer {
    /**
     * Looks up the FrameworkControllerTrainingService component and hands it to the base REST server.
     */
    constructor() {
        super(component.get(FrameworkControllerTrainingService));
    }
}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict'
import * as component from '../../../common/component';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import * as path from 'path';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
import { getExperimentId } from '../../../common/experimentStartupInfo';
import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import {
JobApplicationForm, TrialJobApplicationForm,
TrialJobDetail, NNIManagerIpConfig
} from '../../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils';
import { NFSConfig, KubernetesClusterConfigNFS, KubernetesClusterConfigAzure, KubernetesClusterConfigFactory } from '../kubernetesConfig'
import { KubernetesTrialJobDetail } from '../kubernetesData';
import { validateCodeDir } from '../../common/util';
import { AzureStorageClientUtility } from '../azureStorageClientUtils';
import { KubernetesTrainingService } from '../kubernetesTrainingService';
import { FrameworkControllerTrialConfig } from './frameworkcontrollerConfig';
import { FrameworkControllerJobRestServer } from './frameworkcontrollerJobRestServer';
import { FrameworkControllerClient } from './frameworkcontrollerApiClient';
import { FrameworkControllerJobInfoCollector } from './frameworkcontrollerJobInfoCollector';
/**
 * Training Service implementation for frameworkcontroller
 */
@component.Singleton
class FrameworkControllerTrainingService extends KubernetesTrainingService implements KubernetesTrainingService {
    private frameworkcontrollerTrialConfig?: FrameworkControllerTrialConfig;
    private frameworkcontrollerJobInfoCollector: FrameworkControllerJobInfoCollector;

    constructor() {
        super();
        this.frameworkcontrollerJobInfoCollector = new FrameworkControllerJobInfoCollector(this.trialJobsMap);
        this.experimentId = getExperimentId();
        this.nextTrialSequenceId = -1;
    }

    /**
     * Starts the job REST server, then polls the job info collector until `stopping` is set.
     * @throws Error when the REST server component cannot be obtained
     */
    public async run(): Promise<void> {
        this.kubernetesJobRestServer = component.get(FrameworkControllerJobRestServer);
        if (!this.kubernetesJobRestServer) {
            throw new Error('kubernetesJobRestServer not initialized!');
        }
        await this.kubernetesJobRestServer.start();
        this.log.info(`frameworkcontroller Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`);
        while (!this.stopping) {
            // collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
            await delay(3000);
            await this.frameworkcontrollerJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient);
        }
    }

    /**
     * Prepares the run scripts, uploads the code files, and creates the frameworkcontroller job for a new trial.
     *
     * @param form job application form of the trial
     * @returns detail record of the submitted trial job
     * @throws Error when the cluster config or the CRD client has not been set yet
     */
    public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> {
        if (!this.kubernetesClusterConfig) {
            throw new Error('frameworkcontrollerClusterConfig is not initialized');
        }
        if (!this.kubernetesCRDClient) {
            throw new Error('kubernetesCRDClient is undefined');
        }
        if (!this.kubernetesRestServerPort) {
            const restServer: FrameworkControllerJobRestServer = component.get(FrameworkControllerJobRestServer);
            this.kubernetesRestServerPort = restServer.clusterRestServerPort;
        }

        const trialJobId: string = uniqueString(5);
        const curTrialSequenceId: number = this.generateSequenceId();
        // Set trial's NFS working folder
        const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId);
        const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
        const frameworkcontrollerJobName = `nniexp${this.experimentId}trial${trialJobId}`.toLowerCase();

        await this.prepareRunScript(trialLocalTempFolder, curTrialSequenceId, trialJobId, trialWorkingFolder, form);

        // upload code files
        const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder);

        const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail(
            trialJobId,
            'WAITING',
            Date.now(),
            trialWorkingFolder,
            form,
            frameworkcontrollerJobName,
            curTrialSequenceId,
            trialJobOutputUrl
        );

        // Register the trial job before the frameworkcontroller job is created
        this.trialJobsMap.set(trialJobId, trialJobDetail);

        // Create frameworkcontroller job based on generated frameworkcontroller job resource config
        const frameworkcontrollerJobConfig = await this.prepareFrameworkControllerConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName);
        await this.kubernetesCRDClient.createKubernetesJob(frameworkcontrollerJobConfig);

        // Register the trial job again once the frameworkcontroller job has been created
        this.trialJobsMap.set(trialJobId, trialJobDetail);

        return Promise.resolve(trialJobDetail);
    }

    /**
     * Upload code files to NFS or Azure storage.
     *
     * @param trialJobId id of the trial job
     * @param trialLocalTempFolder local folder holding the files to upload
     * @returns the trial job output URL; an empty string when the storage type is neither 'azureStorage' nor 'nfs'
     * @throws Error when the cluster config has not been set yet
     */
    private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise<string> {
        if (!this.kubernetesClusterConfig) {
            // The message used to name Kubeflow, a leftover from the service this class was derived from.
            throw new Error('frameworkcontroller Cluster config is not initialized');
        }

        let trialJobOutputUrl: string = '';

        if (this.kubernetesClusterConfig.storageType === 'azureStorage') {
            try {
                // upload local files to azure storage
                await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
                    `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`);

                trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
            } catch (error) {
                this.log.error(error);

                return Promise.reject(error);
            }
        } else if (this.kubernetesClusterConfig.storageType === 'nfs') {
            const nfsFrameworkControllerClusterConfig: KubernetesClusterConfigNFS = <KubernetesClusterConfigNFS>this.kubernetesClusterConfig;
            // Create work dir for current trial in NFS directory
            await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`);
            // Copy code files from local dir to NFS mounted dir
            await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);

            const nfsConfig: NFSConfig = nfsFrameworkControllerClusterConfig.nfs;
            trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`;
        }

        return Promise.resolve(trialJobOutputUrl);
    }

    /**
     * Copies the code directory to the local temp folder and writes the install script,
     * one run script per task role, and the hyper-parameter file next to it.
     *
     * @param trialLocalTempFolder local temp folder of the trial
     * @param curTrialSequenceId sequence id of the trial
     * @param trialJobId id of the trial job
     * @param trialWorkingFolder working folder of the trial inside the container
     * @param form job application form; its hyper parameters, when present, are written to a file
     * @throws Error when the trial config has not been set yet
     */
    private async prepareRunScript(trialLocalTempFolder: string, curTrialSequenceId: number, trialJobId: string,
                                   trialWorkingFolder: string, form: JobApplicationForm): Promise<void> {
        if (!this.frameworkcontrollerTrialConfig) {
            throw new Error('frameworkcontroller trial config is not initialized');
        }

        // Only the parent directory is created before the copy; the statement order below is kept on purpose.
        await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
        await cpp.exec(`cp -r ${this.frameworkcontrollerTrialConfig.codeDir} ${trialLocalTempFolder}`);

        const installScriptContent: string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
        // Write NNI installation file to local tmp files
        await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), installScriptContent, { encoding: 'utf8' });
        // Create tmp trial working folder locally.
        await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);

        for (const taskRole of this.frameworkcontrollerTrialConfig.taskRoles) {
            const runScriptContent: string = this.generateRunScript('frameworkcontroller', trialJobId, trialWorkingFolder,
                taskRole.command, curTrialSequenceId.toString(), taskRole.name, taskRole.gpuNum);
            await fs.promises.writeFile(path.join(trialLocalTempFolder, `run_${taskRole.name}.sh`), runScriptContent, { encoding: 'utf8' });
        }

        // Write file content ( parameter.cfg ) to local tmp folders
        const trialForm: TrialJobApplicationForm = (<TrialJobApplicationForm>form);
        if (trialForm && trialForm.hyperParameters) {
            await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)),
                trialForm.hyperParameters.value, { encoding: 'utf8' });
        }
    }

    /**
     * Builds the pod resource section for every task role and generates the job resource config from it.
     *
     * @param trialJobId id of the trial job
     * @param trialWorkingFolder working folder of the trial inside the container
     * @param frameworkcontrollerJobName name of the job to create
     * @returns the frameworkcontroller job resource config object
     * @throws Error when the trial config has not been set yet
     */
    private async prepareFrameworkControllerConfig(trialJobId: string, trialWorkingFolder: string,
                                                   frameworkcontrollerJobName: string): Promise<any> {
        if (!this.frameworkcontrollerTrialConfig) {
            throw new Error('frameworkcontroller trial config is not initialized');
        }

        const podResources: any = [];
        for (const taskRole of this.frameworkcontrollerTrialConfig.taskRoles) {
            const resource: any = {};
            resource.requests = this.generatePodResource(taskRole.memoryMB, taskRole.cpuNum, taskRole.gpuNum);
            // Limits are a copy of the requests.
            resource.limits = Object.assign({}, resource.requests);
            podResources.push(resource);
        }

        // Generate frameworkcontroller job resource config object
        const frameworkcontrollerJobConfig: any =
            this.generateFrameworkControllerJobConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName, podResources);

        return Promise.resolve(frameworkcontrollerJobConfig);
    }

    /**
     * Applies one piece of cluster metadata: NNI manager IP, cluster config, or trial config.
     * Unknown keys are ignored.
     *
     * @param key metadata key, compared against TrialConfigMetadataKey values
     * @param value JSON-encoded metadata value
     */
    public async setClusterMetadata(key: string, value: string): Promise<void> {
        switch (key) {
            case TrialConfigMetadataKey.NNI_MANAGER_IP:
                this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
                break;

            case TrialConfigMetadataKey.FRAMEWORKCONTROLLER_CLUSTER_CONFIG: {
                const frameworkcontrollerClusterJsonObject = JSON.parse(value);
                this.kubernetesClusterConfig = KubernetesClusterConfigFactory.generateKubernetesClusterConfig(frameworkcontrollerClusterJsonObject);
                if (this.kubernetesClusterConfig.storageType === 'azureStorage') {
                    const azureFrameworkControllerClusterConfig = <KubernetesClusterConfigAzure>this.kubernetesClusterConfig;
                    this.azureStorageAccountName = azureFrameworkControllerClusterConfig.azureStorage.accountName;
                    this.azureStorageShare = azureFrameworkControllerClusterConfig.azureStorage.azureShare;
                    await this.createAzureStorage(
                        azureFrameworkControllerClusterConfig.keyVault.vaultName,
                        azureFrameworkControllerClusterConfig.keyVault.name,
                        azureFrameworkControllerClusterConfig.azureStorage.accountName,
                        azureFrameworkControllerClusterConfig.azureStorage.azureShare
                    );
                } else if (this.kubernetesClusterConfig.storageType === 'nfs') {
                    const nfsFrameworkControllerClusterConfig = <KubernetesClusterConfigNFS>this.kubernetesClusterConfig;
                    await this.createNFSStorage(
                        nfsFrameworkControllerClusterConfig.nfs.server,
                        nfsFrameworkControllerClusterConfig.nfs.path
                    );
                }
                this.kubernetesCRDClient = FrameworkControllerClient.generateFrameworkControllerClient();
                break;
            }

            case TrialConfigMetadataKey.TRIAL_CONFIG: {
                const frameworkcontrollerTrialJsonObject = JSON.parse(value);
                this.frameworkcontrollerTrialConfig = new FrameworkControllerTrialConfig(
                    frameworkcontrollerTrialJsonObject.codeDir,
                    frameworkcontrollerTrialJsonObject.taskRoles
                );
                // Validate to make sure codeDir doesn't have too many files
                try {
                    await validateCodeDir(this.frameworkcontrollerTrialConfig.codeDir);
                } catch (error) {
                    this.log.error(error);

                    return Promise.reject(new Error(error));
                }
                break;
            }

            default:
                break;
        }

        return Promise.resolve();
    }

    /**
     * Generate frameworkcontroller resource config file
     * @param trialJobId trial job id
     * @param trialWorkingFolder working folder
     * @param frameworkcontrollerJobName job name
     * @param podResources pod resource sections, one per task role, in task-role order
     * @returns the Framework resource object to submit
     * @throws Error when the cluster config or the trial config has not been set yet
     */
    private generateFrameworkControllerJobConfig(trialJobId: string, trialWorkingFolder: string,
                                                 frameworkcontrollerJobName: string, podResources: any): any {
        if (!this.kubernetesClusterConfig) {
            throw new Error('frameworkcontroller Cluster config is not initialized');
        }
        if (!this.frameworkcontrollerTrialConfig) {
            throw new Error('frameworkcontroller trial config is not initialized');
        }

        const roleConfigs = this.frameworkcontrollerTrialConfig.taskRoles;
        const taskRoles: any[] = [];
        // Indexed loop instead of for..in: array indices stay numeric and prototype keys are never visited.
        for (let index = 0; index < roleConfigs.length; index++) {
            const roleConfig = roleConfigs[index];
            const taskRole = this.generateTaskRoleConfig(
                trialWorkingFolder,
                roleConfig.image,
                `run_${roleConfig.name}.sh`,
                podResources[index]
            );
            // NOTE(review): frameworkAttemptCompletionPolicy is dereferenced without a guard; a task role
            // that omits it fails here with a TypeError - confirm whether a default should be applied.
            taskRoles.push({
                name: roleConfig.name,
                taskNumber: roleConfig.taskNum,
                frameworkAttemptCompletionPolicy: {
                    minFailedTaskCount: roleConfig.frameworkAttemptCompletionPolicy.minFailedTaskCount,
                    minSucceededTaskCount: roleConfig.frameworkAttemptCompletionPolicy.minSucceededTaskCount
                },
                task: taskRole
            });
        }

        return {
            apiVersion: `frameworkcontroller.microsoft.com/v1`,
            kind: 'Framework',
            metadata: {
                name: frameworkcontrollerJobName,
                namespace: 'default',
                labels: {
                    app: this.NNI_KUBERNETES_TRIAL_LABEL,
                    expId: getExperimentId(),
                    trialId: trialJobId
                }
            },
            spec: {
                executionType: 'Start',
                taskRoles: taskRoles
            }
        };
    }

    /**
     * Generates the `task` section (pod template) of one task role.
     *
     * @param trialWorkingFolder working folder of the trial inside the container
     * @param replicaImage container image of the task role
     * @param runScriptFile file name of the run script, resolved against the working folder
     * @param podResources resource section assigned to the container
     * @returns the task role pod template object
     * @throws Error when the cluster config or the trial config has not been set yet
     */
    private generateTaskRoleConfig(trialWorkingFolder: string, replicaImage: string, runScriptFile: string, podResources: any): any {
        if (!this.kubernetesClusterConfig) {
            throw new Error('frameworkcontroller Cluster config is not initialized');
        }
        if (!this.frameworkcontrollerTrialConfig) {
            throw new Error('frameworkcontroller trial config is not initialized');
        }

        const volumeSpecMap = new Map<string, object>();
        if (this.kubernetesClusterConfig.storageType === 'azureStorage') {
            volumeSpecMap.set('nniVolumes', [
                {
                    name: 'nni-vol',
                    azureFile: {
                        secretName: `${this.azureStorageSecretName}`,
                        shareName: `${this.azureStorageShare}`,
                        // Kubernetes spells this field `readOnly`.
                        readOnly: false
                    }
                }]);
        } else {
            // Every storage type other than 'azureStorage' is treated as NFS here.
            const frameworkcontrollerClusterConfigNFS: KubernetesClusterConfigNFS = <KubernetesClusterConfigNFS>this.kubernetesClusterConfig;
            volumeSpecMap.set('nniVolumes', [
                {
                    name: 'nni-vol',
                    nfs: {
                        server: `${frameworkcontrollerClusterConfigNFS.nfs.server}`,
                        path: `${frameworkcontrollerClusterConfigNFS.nfs.path}`
                    }
                }]);
        }

        const taskRole = {
            pod: {
                spec: {
                    containers: [
                        {
                            name: 'framework',
                            image: replicaImage,
                            args: ['sh', `${path.join(trialWorkingFolder, runScriptFile)}`],
                            volumeMounts: [
                                {
                                    name: 'nni-vol',
                                    mountPath: this.CONTAINER_MOUNT_PATH
                                }],
                            resources: podResources
                        }],
                    restartPolicy: 'OnFailure',
                    volumes: volumeSpecMap.get('nniVolumes')
                }
            }
        };

        return taskRole;
    }
}
export { FrameworkControllerTrainingService }
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as fs from 'fs';
import { KubeflowOperator } from './kubeflowConfig';
import { KubernetesCRDClient, GeneralK8sClient } from '../kubernetesApiClient';
/**
 * Common base of the Kubeflow operator clients. On top of KubernetesCRDClient it
 * only adds the factory that maps an (operator, apiVersion) pair to a concrete client.
 */
abstract class KubeflowOperatorClient extends KubernetesCRDClient {
    /**
     * Factory method to generate an operator client.
     *
     * @param kubeflowOperator Kubeflow operator name: 'tf-operator' or 'pytorch-operator'
     * @param operatorApiVersion operator API version: 'v1alpha2' or 'v1beta1'
     * @returns a newly constructed client for the requested operator and API version
     * @throws Error when the operator / apiVersion combination is not supported
     */
    public static generateOperatorClient(kubeflowOperator: KubeflowOperator,
                                         operatorApiVersion: string): KubernetesCRDClient {
        // Strict equality throughout: both operands are strings, so no coercion is wanted.
        if (kubeflowOperator === 'tf-operator') {
            if (operatorApiVersion === 'v1alpha2') {
                return new TFOperatorClientV1Alpha2();
            } else if (operatorApiVersion === 'v1beta1') {
                return new TFOperatorClientV1Beta1();
            }
        } else if (kubeflowOperator === 'pytorch-operator') {
            if (operatorApiVersion === 'v1alpha2') {
                return new PytorchOperatorClientV1Alpha2();
            } else if (operatorApiVersion === 'v1beta1') {
                return new PytorchOperatorClientV1Beta1();
            }
        }
        // Reached for an unknown operator as well as for a known operator with an unknown version.
        throw new Error(`Invalid operator ${kubeflowOperator} or apiVersion ${operatorApiVersion}`);
    }
}
/**
 * Client for the tfjobs custom resource of the kubeflow.org API group, version v1alpha2.
 */
class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
    /**
     * Reads the tfjob v1alpha2 CRD definition from the config folder and adds it to the client.
     */
    public constructor() {
        super();
        const schemaText: string = fs.readFileSync('./config/kubeflow/tfjob-crd-v1alpha2.json', 'utf8');
        this.crdSchema = JSON.parse(schemaText);
        this.client.addCustomResourceDefinition(this.crdSchema);
    }

    /** Container name reported for this operator. */
    public get containerName(): string {
        return 'tensorflow';
    }

    /** tfjobs endpoint of kubeflow.org/v1alpha2 in the 'default' namespace. */
    protected get operator(): any {
        const kubeflowGroup: any = this.client.apis["kubeflow.org"];

        return kubeflowGroup.v1alpha2.namespaces('default').tfjobs;
    }
}
/**
 * Client for the tfjobs custom resource of the kubeflow.org API group, version v1beta1.
 */
class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
    /**
     * Reads the tfjob v1beta1 CRD definition from the config folder and adds it to the client.
     */
    public constructor() {
        super();
        const schemaText: string = fs.readFileSync('./config/kubeflow/tfjob-crd-v1beta1.json', 'utf8');
        this.crdSchema = JSON.parse(schemaText);
        this.client.addCustomResourceDefinition(this.crdSchema);
    }

    /** Container name reported for this operator. */
    public get containerName(): string {
        return 'tensorflow';
    }

    /** tfjobs endpoint of kubeflow.org/v1beta1 in the 'default' namespace. */
    protected get operator(): any {
        const kubeflowGroup: any = this.client.apis["kubeflow.org"];

        return kubeflowGroup.v1beta1.namespaces('default').tfjobs;
    }
}
/**
 * Client for the pytorchjobs custom resource of the kubeflow.org API group, version v1alpha2.
 */
class PytorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
    /**
     * Reads the pytorchjob v1alpha2 CRD definition from the config folder and adds it to the client.
     */
    public constructor() {
        super();
        const schemaText: string = fs.readFileSync('./config/kubeflow/pytorchjob-crd-v1alpha2.json', 'utf8');
        this.crdSchema = JSON.parse(schemaText);
        this.client.addCustomResourceDefinition(this.crdSchema);
    }

    /** Container name reported for this operator. */
    public get containerName(): string {
        return 'pytorch';
    }

    /** pytorchjobs endpoint of kubeflow.org/v1alpha2 in the 'default' namespace. */
    protected get operator(): any {
        const kubeflowGroup: any = this.client.apis["kubeflow.org"];

        return kubeflowGroup.v1alpha2.namespaces('default').pytorchjobs;
    }
}
/**
 * Client for the pytorchjobs custom resource of the kubeflow.org API group, version v1beta1.
 */
class PytorchOperatorClientV1Beta1 extends KubernetesCRDClient {
    /**
     * Reads the pytorchjob v1beta1 CRD definition from the config folder and adds it to the client.
     */
    public constructor() {
        super();
        const schemaText: string = fs.readFileSync('./config/kubeflow/pytorchjob-crd-v1beta1.json', 'utf8');
        this.crdSchema = JSON.parse(schemaText);
        this.client.addCustomResourceDefinition(this.crdSchema);
    }

    /** Container name reported for this operator. */
    public get containerName(): string {
        return 'pytorch';
    }

    /** pytorchjobs endpoint of kubeflow.org/v1beta1 in the 'default' namespace. */
    protected get operator(): any {
        const kubeflowGroup: any = this.client.apis["kubeflow.org"];

        return kubeflowGroup.v1beta1.namespaces('default').pytorchjobs;
    }
}
export { KubeflowOperatorClient, GeneralK8sClient };
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as assert from 'assert';
import { KubernetesClusterConfigAzure, KubernetesClusterConfigNFS, KubernetesStorageKind, NFSConfig, AzureStorage, keyVaultConfig,
KubernetesTrialConfig, KubernetesTrialConfigTemplate, StorageConfig, KubernetesClusterConfig } from '../kubernetesConfig'
import { MethodNotImplementedError } from '../../../common/errors';
/** Operator types supported by Kubeflow */
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' ;
export type DistTrainRole = 'worker' | 'ps' | 'master';
export type KubeflowJobStatus = 'Created' | 'Running' | 'Failed' | 'Succeeded';
export type OperatorApiVersion = 'v1alpha2' | 'v1beta1';
/**
 * Kubeflow cluster configuration: the generic Kubernetes cluster settings plus
 * the Kubeflow operator that should run the jobs.
 */
export class KubeflowClusterConfig extends KubernetesClusterConfig {
    /** Kubeflow operator, e.g. 'tf-operator' */
    public readonly operator: KubeflowOperator;

    /**
     * @param apiVersion operator API version; forwarded as the first argument of the
     *                   KubernetesClusterConfig constructor. This parameter used to be
     *                   called `codeDir`, which did not match how the value is used.
     * @param operator Kubeflow operator
     */
    constructor(apiVersion: string, operator: KubeflowOperator) {
        super(apiVersion);
        this.operator = operator;
    }
}
/**
 * Kubeflow cluster configuration that uses NFS as trial storage.
 */
export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
    /** Kubeflow operator, e.g. 'tf-operator' */
    public readonly operator: KubeflowOperator;

    /**
     * @param operator Kubeflow operator
     * @param apiVersion operator API version
     * @param nfs NFS settings
     * @param storage optional storage kind taken from the user configuration
     */
    constructor(
        operator: KubeflowOperator,
        apiVersion: string,
        nfs: NFSConfig,
        storage?: KubernetesStorageKind
    ) {
        super(apiVersion, nfs, storage);
        this.operator = operator;
    }

    /**
     * Builds a KubeflowClusterConfigNFS from an already parsed json object.
     * The object's fields are copied as they are; only `undefined` input is rejected.
     */
    public static getInstance(jsonObject: object): KubeflowClusterConfigNFS {
        const parsed = jsonObject as KubeflowClusterConfigNFS;
        assert(parsed !== undefined);

        return new KubeflowClusterConfigNFS(parsed.operator, parsed.apiVersion, parsed.nfs, parsed.storage);
    }

    /** Storage kind of this configuration, always 'nfs'. */
    public get storageType(): KubernetesStorageKind {
        return 'nfs';
    }
}
/**
 * Kubeflow cluster configuration that uses Azure storage as trial storage.
 */
export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure {
    /** Kubeflow operator, e.g. 'tf-operator' */
    public readonly operator: KubeflowOperator;

    /**
     * @param operator Kubeflow operator
     * @param apiVersion operator API version
     * @param keyVault key vault settings
     * @param azureStorage Azure storage settings
     * @param storage optional storage kind taken from the user configuration
     */
    constructor(
        operator: KubeflowOperator,
        apiVersion: string,
        keyVault: keyVaultConfig,
        azureStorage: AzureStorage,
        storage?: KubernetesStorageKind
    ) {
        super(apiVersion, keyVault, azureStorage, storage);
        this.operator = operator;
    }

    /**
     * Builds a KubeflowClusterConfigAzure from an already parsed json object.
     * The object's fields are copied as they are, without validation.
     */
    public static getInstance(jsonObject: object): KubeflowClusterConfigAzure {
        const parsed = jsonObject as KubeflowClusterConfigAzure;

        return new KubeflowClusterConfigAzure(
            parsed.operator,
            parsed.apiVersion,
            parsed.keyVault,
            parsed.azureStorage,
            parsed.storage
        );
    }

    /** Storage kind of this configuration, always 'azureStorage'. */
    public get storageType(): KubernetesStorageKind {
        return 'azureStorage';
    }
}
/**
 * Factory that picks the Kubeflow cluster configuration class matching the
 * `storage` field of a parsed json configuration.
 */
export class KubeflowClusterConfigFactory {
    /**
     * @param jsonObject parsed cluster configuration
     * @returns an Azure configuration when storage is 'azureStorage',
     *          an NFS configuration when storage is 'nfs' or not set
     * @throws Error when jsonObject is null/undefined or names an unsupported storage kind
     */
    public static generateKubeflowClusterConfig(jsonObject: object): KubeflowClusterConfig {
        const storageConfig = <StorageConfig>jsonObject;
        if (!storageConfig) {
            throw new Error("Invalid json object as a StorageConfig instance");
        }

        switch (storageConfig.storage) {
            case 'azureStorage':
                return KubeflowClusterConfigAzure.getInstance(jsonObject);
            case 'nfs':
            case undefined:
                // NFS is the default when no storage kind is given.
                return KubeflowClusterConfigNFS.getInstance(jsonObject);
            default:
                // Interpolating the object itself only printed "[object Object]";
                // report the value that actually caused the rejection instead.
                throw new Error(`Invalid json object: unsupported storage kind '${String(storageConfig.storage)}'`);
        }
    }
}
/**
 * Base class of the Kubeflow trial configurations.
 */
export class KubeflowTrialConfig extends KubernetesTrialConfig {
/**
 * @param codeDir forwarded unchanged to the KubernetesTrialConfig constructor
 */
constructor(codeDir: string) {
super(codeDir);
}
/**
 * Kubeflow operator this trial configuration is written for.
 * This base implementation always throws MethodNotImplementedError;
 * KubeflowTrialConfigTensorflow and KubeflowTrialConfigPytorch override it.
 */
public get operatorType(): KubeflowOperator {
throw new MethodNotImplementedError();
}
}
/**
 * Trial configuration of one Kubeflow role: the generic Kubernetes trial template plus a replica count.
 */
export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate{
// Replication number of the current role
public readonly replicas: number;
/**
 * @param replicas replication number of the current role
 * @param command forwarded to the KubernetesTrialConfigTemplate constructor
 * @param gpuNum forwarded to the KubernetesTrialConfigTemplate constructor
 * @param cpuNum forwarded to the KubernetesTrialConfigTemplate constructor
 * @param memoryMB forwarded to the KubernetesTrialConfigTemplate constructor
 * @param image forwarded to the KubernetesTrialConfigTemplate constructor
 */
constructor(replicas: number, command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string) {
super(command, gpuNum, cpuNum, memoryMB, image);
this.replicas = replicas;
}
}
/**
 * Kubeflow trial configuration for tf-operator: a mandatory worker role and an optional ps role.
 */
export class KubeflowTrialConfigTensorflow extends KubeflowTrialConfig {
// Template of the 'ps' role; may be omitted
public readonly ps?: KubeflowTrialConfigTemplate;
// Template of the 'worker' role
public readonly worker: KubeflowTrialConfigTemplate;
/**
 * @param codeDir forwarded to the KubeflowTrialConfig constructor
 * @param worker template of the worker role
 * @param ps optional template of the ps role
 */
constructor(codeDir: string, worker: KubeflowTrialConfigTemplate, ps?: KubeflowTrialConfigTemplate) {
super(codeDir);
this.ps = ps;
this.worker = worker;
}
/**
 * Kubeflow operator of this configuration, always 'tf-operator'.
 */
public get operatorType(): KubeflowOperator {
return 'tf-operator';
}
}
/**
 * Kubeflow trial configuration for pytorch-operator: a mandatory master role and an optional worker role.
 */
export class KubeflowTrialConfigPytorch extends KubeflowTrialConfig {
// Template of the 'master' role
public readonly master: KubeflowTrialConfigTemplate;
// Template of the 'worker' role; may be omitted
public readonly worker?: KubeflowTrialConfigTemplate;
/**
 * @param codeDir forwarded to the KubeflowTrialConfig constructor
 * @param master template of the master role
 * @param worker optional template of the worker role
 */
constructor(codeDir: string, master: KubeflowTrialConfigTemplate, worker?: KubeflowTrialConfigTemplate) {
super(codeDir);
this.master = master;
this.worker = worker;
}
/**
 * Kubeflow operator of this configuration, always 'pytorch-operator'.
 */
public get operatorType(): KubeflowOperator {
return 'pytorch-operator';
}
}
/**
 * Factory that builds the trial configuration class matching a Kubeflow operator.
 */
export class KubeflowTrialConfigFactory {
    /**
     * @param jsonObject parsed trial configuration; its fields are copied without validation
     * @param operator Kubeflow operator the configuration is meant for
     * @returns a KubeflowTrialConfigTensorflow for 'tf-operator',
     *          a KubeflowTrialConfigPytorch for 'pytorch-operator'
     * @throws Error when the operator is not one of the supported values
     */
    public static generateKubeflowTrialConfig(jsonObject: object, operator: KubeflowOperator): KubeflowTrialConfig {
        switch (operator) {
            case 'tf-operator': {
                const tensorflowConfig = <KubeflowTrialConfigTensorflow>jsonObject;

                return new KubeflowTrialConfigTensorflow(
                    tensorflowConfig.codeDir,
                    tensorflowConfig.worker,
                    tensorflowConfig.ps
                );
            }
            case 'pytorch-operator': {
                const pytorchConfig = <KubeflowTrialConfigPytorch>jsonObject;

                return new KubeflowTrialConfigPytorch(
                    pytorchConfig.codeDir,
                    pytorchConfig.master,
                    pytorchConfig.worker
                );
            }
            default:
                // Only an unsupported operator can lead here. The previous message blamed the
                // json object and rendered it as "[object Object]"; name the operator instead.
                throw new Error(`Invalid kubeflow operator '${String(operator)}' for trial config`);
        }
    }
}
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesCRDClient } from '../kubernetesApiClient';
import { KubernetesJobInfoCollector } from '../kubernetesJobInfoCollector';
import { KubeflowJobStatus } from './kubeflowConfig';
/**
 * Collects Kubeflow job info from the Kubernetes cluster and updates the locally tracked trial job status.
 */
export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
    constructor(jobMap: Map<string, KubernetesTrialJobDetail>) {
        super(jobMap);
    }

    /**
     * Queries one Kubeflow job and copies its latest condition onto the trial job.
     *
     * Jobs whose status is not listed in `statusesNeedToCheck` are left untouched. A failed
     * query is logged and otherwise ignored. The last entry of `status.conditions` decides
     * the new state:
     *  - 'Created'   -> WAITING, startTime set from the condition's lastUpdateTime
     *  - 'Running'   -> RUNNING, startTime set only if it is not set yet
     *  - 'Failed'    -> FAILED, endTime set
     *  - 'Succeeded' -> SUCCEEDED, endTime set
     * Any other condition type changes nothing.
     *
     * @param kubernetesCRDClient client used to query the job; the promise is rejected when it is undefined
     * @param kubernetesTrialJob trial job whose status and timestamps are updated in place
     */
    protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
                                               kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> {
        if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) {
            return Promise.resolve();
        }

        if (kubernetesCRDClient === undefined) {
            return Promise.reject('kubernetesCRDClient is undefined');
        }

        let kubernetesJobInfo: any;
        try {
            kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName);
        } catch (error) {
            this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`);

            // A failed query is not treated as an error status
            return Promise.resolve();
        }

        // A job may have no status or no conditions. An empty conditions array is truthy, so it
        // has to be excluded explicitly or the lookup of the last element yields undefined.
        const conditions: any = kubernetesJobInfo && kubernetesJobInfo.status && kubernetesJobInfo.status.conditions;
        if (!Array.isArray(conditions) || conditions.length === 0) {
            return Promise.resolve();
        }

        const latestCondition: any = conditions[conditions.length - 1];
        if (!latestCondition) {
            return Promise.resolve();
        }

        const tfJobType : KubeflowJobStatus = <KubeflowJobStatus>latestCondition.type;
        switch (tfJobType) {
            case 'Created':
                kubernetesTrialJob.status = 'WAITING';
                kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
                break;
            case 'Running':
                kubernetesTrialJob.status = 'RUNNING';
                // Keep a start time that was recorded earlier
                if (!kubernetesTrialJob.startTime) {
                    kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
                }
                break;
            case 'Failed':
                kubernetesTrialJob.status = 'FAILED';
                kubernetesTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
                break;
            case 'Succeeded':
                kubernetesTrialJob.status = 'SUCCEEDED';
                kubernetesTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
                break;
            default:
                break;
        }

        return Promise.resolve();
    }
}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as component from '../../../common/component';
import { KubeflowTrainingService } from './kubeflowTrainingService';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/**
 * Kubeflow training service REST server; provides the REST API that supports Kubeflow job metrics updates.
 * All behavior comes from KubernetesJobRestServer; this class only binds it to the Kubeflow training service.
 */
@component.Singleton
export class KubeflowJobRestServer extends KubernetesJobRestServer{
/**
 * Hands the KubeflowTrainingService obtained through component.get to the base REST server.
 */
constructor() {
super(component.get(KubeflowTrainingService));
}
}
\ No newline at end of file
...@@ -19,11 +19,9 @@ ...@@ -19,11 +19,9 @@
'use strict'; 'use strict';
import * as fs from 'fs';
import * as os from 'os' import * as os from 'os'
import * as path from 'path'; import * as path from 'path';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { KubeflowOperator, OperatorApiVersion } from './kubeflowConfig';
var K8SClient = require('kubernetes-client').Client; var K8SClient = require('kubernetes-client').Client;
var K8SConfig = require('kubernetes-client').config; var K8SConfig = require('kubernetes-client').config;
...@@ -52,7 +50,7 @@ class GeneralK8sClient { ...@@ -52,7 +50,7 @@ class GeneralK8sClient {
} }
} }
abstract class KubeflowOperatorClient { abstract class KubernetesCRDClient {
protected readonly client: any; protected readonly client: any;
protected readonly log: Logger = getLogger(); protected readonly log: Logger = getLogger();
protected crdSchema: any; protected crdSchema: any;
...@@ -66,28 +64,6 @@ abstract class KubeflowOperatorClient { ...@@ -66,28 +64,6 @@ abstract class KubeflowOperatorClient {
public abstract get containerName(): string; public abstract get containerName(): string;
/**
* Factory method to generate operator cliet
*/
public static generateOperatorClient(kubeflowOperator: KubeflowOperator,
operatorApiVersion: OperatorApiVersion): KubeflowOperatorClient {
if(kubeflowOperator === 'tf-operator') {
if(operatorApiVersion == 'v1alpha2') {
return new TFOperatorClientV1Alpha2();
} else if(operatorApiVersion == 'v1beta1') {
return new TFOperatorClientV1Beta1();
}
} else if(kubeflowOperator === 'pytorch-operator') {
if(operatorApiVersion == 'v1alpha2') {
return new PytorchOperatorClientV1Alpha2();
} else if(operatorApiVersion == 'v1beta1') {
return new PytorchOperatorClientV1Beta1();
}
}
throw new Error(`Invalid operator ${kubeflowOperator} or apiVersion ${operatorApiVersion}`);
}
public get jobKind(): string { public get jobKind(): string {
if(this.crdSchema if(this.crdSchema
&& this.crdSchema.spec && this.crdSchema.spec
...@@ -109,19 +85,19 @@ abstract class KubeflowOperatorClient { ...@@ -109,19 +85,19 @@ abstract class KubeflowOperatorClient {
} }
} }
public async createKubeflowJob(jobManifest: any): Promise<boolean> { public async createKubernetesJob(jobManifest: any): Promise<boolean> {
let result: Promise<boolean>; let result: Promise<boolean>;
const response : any = await this.operator.post({body: jobManifest}); const response : any = await this.operator.post({body: jobManifest});
if(response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) { if(response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) {
result = Promise.resolve(true); result = Promise.resolve(true);
} else { } else {
result = Promise.reject(`KubeflowOperatorClient create tfjobs failed, statusCode is ${response.statusCode}`); result = Promise.reject(`Create kubernetes job failed, statusCode is ${response.statusCode}`);
} }
return result; return result;
} }
//TODO : replace any //TODO : replace any
public async getKubeflowJob(kubeflowJobName: string): Promise<any> { public async getKubernetesJob(kubeflowJobName: string): Promise<any> {
let result: Promise<any>; let result: Promise<any>;
const response : any = await this.operator(kubeflowJobName).get(); const response : any = await this.operator(kubeflowJobName).get();
if(response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) { if(response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) {
...@@ -132,12 +108,17 @@ abstract class KubeflowOperatorClient { ...@@ -132,12 +108,17 @@ abstract class KubeflowOperatorClient {
return result; return result;
} }
public async deleteKubeflowJob(labels: Map<string, string>): Promise<boolean> { public async deleteKubernetesJob(labels: Map<string, string>): Promise<boolean> {
let result: Promise<boolean>; let result: Promise<boolean>;
// construct match query from labels for deleting tfjob // construct match query from labels for deleting tfjob
const matchQuery: string = Array.from(labels.keys()).map(labelKey => `${labelKey}=${labels.get(labelKey)}`).join(','); const matchQuery: string = Array.from(labels.keys()).map(labelKey => `${labelKey}=${labels.get(labelKey)}`).join(',');
try { try {
const deleteResult : any = await this.operator().delete({ qs: { labelSelector: matchQuery } }); const deleteResult : any = await this.operator().delete({
qs: {
labelSelector: matchQuery,
propagationPolicy: "Background"
}
});
if(deleteResult.statusCode && deleteResult.statusCode >= 200 && deleteResult.statusCode <= 299) { if(deleteResult.statusCode && deleteResult.statusCode >= 200 && deleteResult.statusCode <= 299) {
result = Promise.resolve(true); result = Promise.resolve(true);
} else { } else {
...@@ -151,80 +132,4 @@ abstract class KubeflowOperatorClient { ...@@ -151,80 +132,4 @@ abstract class KubeflowOperatorClient {
} }
} }
class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient { export { KubernetesCRDClient, GeneralK8sClient };
/**
* constructor, to initialize tfjob CRD definition
*/
public constructor() {
super();
this.crdSchema = JSON.parse(fs.readFileSync('./config/kubeflow/tfjob-crd-v1alpha2.json', 'utf8'));
this.client.addCustomResourceDefinition(this.crdSchema);
}
protected get operator(): any {
return this.client.apis["kubeflow.org"].v1alpha2.namespaces('default').tfjobs;
}
public get containerName(): string {
return 'tensorflow';
}
}
class TFOperatorClientV1Beta1 extends KubeflowOperatorClient {
/**
* constructor, to initialize tfjob CRD definition
*/
public constructor() {
super();
this.crdSchema = JSON.parse(fs.readFileSync('./config/kubeflow/tfjob-crd-v1beta1.json', 'utf8'));
this.client.addCustomResourceDefinition(this.crdSchema);
}
protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta1.namespaces('default').tfjobs;
}
public get containerName(): string {
return 'tensorflow';
}
}
class PytorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
/**
* constructor, to initialize tfjob CRD definition
*/
public constructor() {
super();
this.crdSchema = JSON.parse(fs.readFileSync('./config/kubeflow/pytorchjob-crd-v1alpha2.json', 'utf8'));
this.client.addCustomResourceDefinition(this.crdSchema);
}
protected get operator(): any {
return this.client.apis["kubeflow.org"].v1alpha2.namespaces('default').pytorchjobs;
}
public get containerName(): string {
return 'pytorch';
}
}
class PytorchOperatorClientV1Beta1 extends KubeflowOperatorClient {
/**
* constructor, to initialize tfjob CRD definition
*/
public constructor() {
super();
this.crdSchema = JSON.parse(fs.readFileSync('./config/kubeflow/pytorchjob-crd-v1beta1.json', 'utf8'));
this.client.addCustomResourceDefinition(this.crdSchema);
}
protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta1.namespaces('default').pytorchjobs;
}
public get containerName(): string {
return 'pytorch';
}
}
export { KubeflowOperatorClient, GeneralK8sClient };
...@@ -19,59 +19,99 @@ ...@@ -19,59 +19,99 @@
'use strict'; 'use strict';
/** operator types that kubeflow supported */ export type KubernetesStorageKind = 'nfs' | 'azureStorage';
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' ; import { MethodNotImplementedError } from '../../common/errors';
export type KubeflowStorageKind = 'nfs' | 'azureStorage';
export type DistTrainRole = 'worker' | 'ps' | 'master';
export type OperatorApiVersion = 'v1alpha2' | 'v1beta1';
/** export abstract class KubernetesClusterConfig {
* Kuberflow cluster configuration public readonly storage?: KubernetesStorageKind;
* public readonly apiVersion: string;
*/
export class KubeflowClusterConfigBase {
/** Name of Kubeflow operator, like tf-operator */
public readonly operator: KubeflowOperator;
public readonly apiVersion: OperatorApiVersion;
public readonly storage?: KubeflowStorageKind;
/** constructor(apiVersion: string, storage?: KubernetesStorageKind) {
* Constructor this.storage = storage;
* @param userName User name of Kubeflow Cluster
* @param passWord password of Kubeflow Cluster
* @param host Host IP of Kubeflow Cluster
*/
constructor(operator: KubeflowOperator, apiVersion: OperatorApiVersion, storage?: KubeflowStorageKind) {
this.operator = operator;
this.apiVersion = apiVersion; this.apiVersion = apiVersion;
}
public get storageType(): KubernetesStorageKind{
throw new MethodNotImplementedError();
}
}
export class StorageConfig {
public readonly storage?: KubernetesStorageKind;
constructor(storage?: KubernetesStorageKind) {
this.storage = storage; this.storage = storage;
} }
} }
export class KubeflowClusterConfigNFS extends KubeflowClusterConfigBase{ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
public readonly nfs: NFSConfig; public readonly nfs: NFSConfig;
constructor(operator: KubeflowOperator, constructor(
apiVersion: OperatorApiVersion, apiVersion: string,
nfs: NFSConfig, storage?: KubeflowStorageKind) { nfs: NFSConfig,
super(operator, apiVersion, storage); storage?: KubernetesStorageKind
) {
super(apiVersion, storage);
this.nfs = nfs; this.nfs = nfs;
} }
public get storageType(): KubernetesStorageKind{
return 'nfs';
}
public static getInstance(jsonObject: object): KubernetesClusterConfigNFS {
let kubernetesClusterConfigObjectNFS = <KubernetesClusterConfigNFS>jsonObject;
return new KubernetesClusterConfigNFS(
kubernetesClusterConfigObjectNFS.apiVersion,
kubernetesClusterConfigObjectNFS.nfs,
kubernetesClusterConfigObjectNFS.storage
);
}
} }
export class KubeflowClusterConfigAzure extends KubeflowClusterConfigBase{ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
public readonly keyVault: keyVaultConfig; public readonly keyVault: keyVaultConfig;
public readonly azureStorage: AzureStorage; public readonly azureStorage: AzureStorage;
constructor(operator: KubeflowOperator, constructor(
apiVersion: OperatorApiVersion, apiVersion: string,
keyVault: keyVaultConfig, keyVault: keyVaultConfig,
azureStorage: AzureStorage, azureStorage: AzureStorage,
storage?: KubeflowStorageKind) { storage?: KubernetesStorageKind
super(operator, apiVersion, storage); ) {
super(apiVersion, storage);
this.keyVault = keyVault; this.keyVault = keyVault;
this.azureStorage = azureStorage; this.azureStorage = azureStorage;
} }
public get storageType(): KubernetesStorageKind{
return 'azureStorage';
}
public static getInstance(jsonObject: object): KubernetesClusterConfigAzure {
let kubernetesClusterConfigObjectAzure = <KubernetesClusterConfigAzure>jsonObject;
return new KubernetesClusterConfigAzure(
kubernetesClusterConfigObjectAzure.apiVersion,
kubernetesClusterConfigObjectAzure.keyVault,
kubernetesClusterConfigObjectAzure.azureStorage,
kubernetesClusterConfigObjectAzure.storage
);
}
}
export class KubernetesClusterConfigFactory {
public static generateKubernetesClusterConfig(jsonObject: object): KubernetesClusterConfig {
let storageConfig = <StorageConfig>jsonObject;
switch(storageConfig.storage) {
case 'azureStorage':
return KubernetesClusterConfigAzure.getInstance(jsonObject);
case 'nfs' || undefined :
return KubernetesClusterConfigNFS.getInstance(jsonObject);
}
throw new Error(`Invalid json object ${jsonObject}`);
}
} }
/** /**
...@@ -121,12 +161,9 @@ export class AzureStorage { ...@@ -121,12 +161,9 @@ export class AzureStorage {
} }
/** /**
* Trial job configuration for Kubeflow * Trial job configuration for Kubernetes
*/ */
export class KubeflowTrialConfigTemplate { export class KubernetesTrialConfigTemplate {
/** replication number of current role */
public readonly replicas: number;
/** CPU number */ /** CPU number */
public readonly cpuNum: number; public readonly cpuNum: number;
...@@ -142,9 +179,8 @@ export class KubeflowTrialConfigTemplate { ...@@ -142,9 +179,8 @@ export class KubeflowTrialConfigTemplate {
/** Required GPU number for trial job. The number should be in [0,100] */ /** Required GPU number for trial job. The number should be in [0,100] */
public readonly gpuNum : number; public readonly gpuNum : number;
constructor(replicas: number, command : string, gpuNum : number, constructor(command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string) { cpuNum: number, memoryMB: number, image: string) {
this.replicas = replicas;
this.command = command; this.command = command;
this.gpuNum = gpuNum; this.gpuNum = gpuNum;
this.cpuNum = cpuNum; this.cpuNum = cpuNum;
...@@ -153,33 +189,10 @@ export class KubeflowTrialConfigTemplate { ...@@ -153,33 +189,10 @@ export class KubeflowTrialConfigTemplate {
} }
} }
export class KubeflowTrialConfigBase { export class KubernetesTrialConfig {
public readonly codeDir: string; public readonly codeDir: string;
constructor(codeDir: string) { constructor(codeDir: string) {
this.codeDir = codeDir; this.codeDir = codeDir;
} }
} }
\ No newline at end of file
export class KubeflowTrialConfigTensorflow extends KubeflowTrialConfigBase{
public readonly ps?: KubeflowTrialConfigTemplate;
public readonly worker: KubeflowTrialConfigTemplate;
constructor(codeDir: string, worker: KubeflowTrialConfigTemplate, ps?: KubeflowTrialConfigTemplate) {
super(codeDir);
this.ps = ps;
this.worker = worker;
}
}
export class KubeflowTrialConfigPytorch extends KubeflowTrialConfigBase{
public readonly master: KubeflowTrialConfigTemplate;
public readonly worker?: KubeflowTrialConfigTemplate;
constructor(codeDir: string, master: KubeflowTrialConfigTemplate, worker?: KubeflowTrialConfigTemplate) {
super(codeDir);
this.master = master;
this.worker = worker;
}
}
...@@ -25,7 +25,7 @@ import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../commo ...@@ -25,7 +25,7 @@ import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../commo
* KubeflowTrialJobDetail * KubeflowTrialJobDetail
*/ */
// tslint:disable-next-line:max-classes-per-file // tslint:disable-next-line:max-classes-per-file
export class KubeflowTrialJobDetail implements TrialJobDetail { export class KubernetesTrialJobDetail implements TrialJobDetail {
public id: string; public id: string;
public status: TrialJobStatus; public status: TrialJobStatus;
public submitTime: number; public submitTime: number;
...@@ -35,19 +35,19 @@ export class KubeflowTrialJobDetail implements TrialJobDetail { ...@@ -35,19 +35,19 @@ export class KubeflowTrialJobDetail implements TrialJobDetail {
public url?: string; public url?: string;
public workingDirectory: string; public workingDirectory: string;
public form: JobApplicationForm; public form: JobApplicationForm;
public kubeflowJobName: string; public kubernetesJobName: string;
public sequenceId: number; public sequenceId: number;
public queryJobFailedCount: number; public queryJobFailedCount: number;
constructor(id: string, status: TrialJobStatus, submitTime: number, constructor(id: string, status: TrialJobStatus, submitTime: number,
workingDirectory: string, form: JobApplicationForm, workingDirectory: string, form: JobApplicationForm,
kubeflowJobName: string, sequenceId: number, url: string) { kubernetesJobName: string, sequenceId: number, url: string) {
this.id = id; this.id = id;
this.status = status; this.status = status;
this.submitTime = submitTime; this.submitTime = submitTime;
this.workingDirectory = workingDirectory; this.workingDirectory = workingDirectory;
this.form = form; this.form = form;
this.kubeflowJobName = kubeflowJobName; this.kubernetesJobName = kubernetesJobName;
this.sequenceId = sequenceId; this.sequenceId = sequenceId;
this.tags = []; this.tags = [];
this.queryJobFailedCount = 0; this.queryJobFailedCount = 0;
...@@ -55,4 +55,21 @@ export class KubeflowTrialJobDetail implements TrialJobDetail { ...@@ -55,4 +55,21 @@ export class KubeflowTrialJobDetail implements TrialJobDetail {
} }
} }
export type KubeflowTFJobType = 'Created' | 'Running' | 'Failed' | 'Succeeded'; export const KubernetesScriptFormat =
\ No newline at end of file `#!/bin/bash
export NNI_PLATFORM={0}
export NNI_SYS_DIR=$PWD/nni/{1}
export NNI_OUTPUT_DIR={2}
export MULTI_PHASE=false
export NNI_TRIAL_JOB_ID={3}
export NNI_EXP_ID={4}
export NNI_CODE_DIR={5}
export NNI_TRIAL_SEQ_ID={6}
{7}
mkdir -p $NNI_SYS_DIR
mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} `
+ `1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
...@@ -20,88 +20,45 @@ ...@@ -20,88 +20,45 @@
'use strict'; 'use strict';
import * as assert from 'assert'; import * as assert from 'assert';
import * as cpp from 'child-process-promise';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { KubeflowTrialJobDetail, KubeflowTFJobType} from './kubeflowData';
import { NNIError, NNIErrorNames } from '../../common/errors'; import { NNIError, NNIErrorNames } from '../../common/errors';
import { TrialJobStatus } from '../../common/trainingService'; import { TrialJobStatus } from '../../common/trainingService';
import { KubeflowOperatorClient } from './kubernetesApiClient'; import { KubernetesCRDClient } from './kubernetesApiClient';
import { MethodNotImplementedError } from '../../common/errors';
import { KubernetesTrialJobDetail } from './kubernetesData';
/**
 * Base collector that gathers trial job information from a Kubernetes cluster and
 * updates the locally tracked trial job details.
 *
 * This class only drives the polling loop. retrieveSingleTrialJobInfo throws
 * MethodNotImplementedError here and is meant to be overridden by a subclass that
 * knows how to read the status of its own kind of job.
 */
export class KubernetesJobInfoCollector {
    /** Trial jobs keyed by trial job id; the map instance is shared with the creator of this collector. */
    protected readonly trialJobsMap : Map<string, KubernetesTrialJobDetail>;
    protected readonly log: Logger = getLogger();
    /** Set to RUNNING and WAITING; not read in this class, available to subclasses. */
    protected readonly statusesNeedToCheck: TrialJobStatus[];

    /**
     * @param jobMap map of trial job id to trial job detail; stored by reference, not copied
     */
    constructor(jobMap: Map<string, KubernetesTrialJobDetail>) {
        this.trialJobsMap = jobMap;
        this.statusesNeedToCheck = ['RUNNING', 'WAITING'];
    }

    /**
     * Refresh the status of every tracked trial job.
     *
     * Jobs submitted less than 20 seconds ago are skipped for this round; all other
     * jobs are queried concurrently and the method resolves once every query settles
     * successfully.
     *
     * @param kubernetesCRDClient client used to query the cluster; must not be undefined
     * @throws AssertionError if kubernetesCRDClient is undefined
     * @throws NNIError (NOT_FOUND) if the map holds an empty entry for a trial job id
     */
    public async retrieveTrialStatus(kubernetesCRDClient: KubernetesCRDClient | undefined) : Promise<void> {
        assert(kubernetesCRDClient !== undefined);

        // The scheduler needs some time before a freshly submitted job is visible,
        // so jobs younger than this are not queried yet.
        const scheduleBufferMs: number = 20 * 1000;
        const updateKubernetesTrialJobs : Promise<void>[] = [];

        for (const [trialJobId, kubernetesTrialJob] of this.trialJobsMap) {
            if (!kubernetesTrialJob) {
                throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`);
            }
            if (Date.now() - kubernetesTrialJob.submitTime < scheduleBufferMs) {
                // Skip only this job. Returning here would abandon the remaining jobs
                // and leave the promises collected so far un-awaited.
                continue;
            }
            updateKubernetesTrialJobs.push(this.retrieveSingleTrialJobInfo(kubernetesCRDClient, kubernetesTrialJob));
        }

        await Promise.all(updateKubernetesTrialJobs);
    }

    /**
     * Query the cluster for one trial job and update its detail object.
     * Not implemented in the base class.
     *
     * @param kubernetesCRDClient client used to query the cluster
     * @param kubernetesTrialJob trial job detail to refresh
     * @throws MethodNotImplementedError always, unless overridden
     */
    protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
        kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> {
        throw new MethodNotImplementedError();
    }
}
\ No newline at end of file
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
import * as component from '../../common/component'; import * as component from '../../common/component';
import { Inject } from 'typescript-ioc'; import { Inject } from 'typescript-ioc';
import { KubeflowTrainingService } from './kubeflowTrainingService'; import { KubernetesTrainingService } from './kubernetesTrainingService';
import { ClusterJobRestServer } from '../common/clusterJobRestServer' import { ClusterJobRestServer } from '../common/clusterJobRestServer'
/** /**
...@@ -29,23 +29,26 @@ import { ClusterJobRestServer } from '../common/clusterJobRestServer' ...@@ -29,23 +29,26 @@ import { ClusterJobRestServer } from '../common/clusterJobRestServer'
* *
*/ */
@component.Singleton @component.Singleton
export class KubeflowJobRestServer extends ClusterJobRestServer{ export class KubernetesJobRestServer extends ClusterJobRestServer{
@Inject @Inject
private readonly kubeflowTrainingService : KubeflowTrainingService; private kubernetesTrainingService? : KubernetesTrainingService;
/** /**
* constructor to provide NNIRestServer's own rest property, e.g. port * constructor to provide NNIRestServer's own rest property, e.g. port
*/ */
constructor() { constructor(kubernetesTrainingService: KubernetesTrainingService) {
super(); super();
this.kubeflowTrainingService = component.get(KubeflowTrainingService); this.kubernetesTrainingService = kubernetesTrainingService;
} }
protected handleTrialMetrics(jobId : string, metrics : any[]) : void { protected handleTrialMetrics(jobId : string, metrics : any[]) : void {
if(!this.kubernetesTrainingService) {
throw Error('kubernetesTrainingService not initialized!');
}
// Split metrics array into single metric, then emit // Split metrics array into single metric, then emit
// Warning: If not split metrics into single ones, the behavior will be UNKNOWN // Warning: If not split metrics into single ones, the behavior will be UNKNOWN
for (const singleMetric of metrics) { for (const singleMetric of metrics) {
this.kubeflowTrainingService.MetricsEmitter.emit('metric', { this.kubernetesTrainingService.MetricsEmitter.emit('metric', {
id : jobId, id : jobId,
data : singleMetric data : singleMetric
}); });
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment