Unverified Commit 36dbc0fe authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

support frameworkcontroller training service (#484)

Add frameworkcontroller training service based on kubeflow training service.
Refactor the code structure: add a kubernetes training service as the parent class, and make the kubeflow training service and the frameworkcontroller training service its child classes.
parent 416b8b53
{
"kind": "CustomResourceDefinition",
"spec": {
"scope": "Namespaced",
"version": "v1",
"group": "frameworkcontroller.microsoft.com",
"names": {
"kind": "Framework",
"plural": "frameworks",
"singular": "framework"
}
},
"apiVersion": "apiextensions.k8s.io/v1beta1",
"metadata": {
"name": "frameworks.frameworkcontroller.microsoft.com"
}
}
...@@ -37,7 +37,8 @@ import { ...@@ -37,7 +37,8 @@ import {
RemoteMachineTrainingService RemoteMachineTrainingService
} from './training_service/remote_machine/remoteMachineTrainingService'; } from './training_service/remote_machine/remoteMachineTrainingService';
import { PAITrainingService } from './training_service/pai/paiTrainingService'; import { PAITrainingService } from './training_service/pai/paiTrainingService';
import { KubeflowTrainingService } from './training_service/kubeflow/kubeflowTrainingService'; import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService';
import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService';
function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number) { function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number) {
const createNew: boolean = (startExpMode === 'new'); const createNew: boolean = (startExpMode === 'new');
...@@ -54,7 +55,10 @@ async function initContainer(platformMode: string): Promise<void> { ...@@ -54,7 +55,10 @@ async function initContainer(platformMode: string): Promise<void> {
Container.bind(TrainingService).to(PAITrainingService).scope(Scope.Singleton); Container.bind(TrainingService).to(PAITrainingService).scope(Scope.Singleton);
} else if (platformMode === 'kubeflow') { } else if (platformMode === 'kubeflow') {
Container.bind(TrainingService).to(KubeflowTrainingService).scope(Scope.Singleton); Container.bind(TrainingService).to(KubeflowTrainingService).scope(Scope.Singleton);
} else { } else if (platformMode === 'frameworkcontroller') {
Container.bind(TrainingService).to(FrameworkControllerTrainingService).scope(Scope.Singleton);
}
else {
throw new Error(`Error: unsupported mode: ${mode}`); throw new Error(`Error: unsupported mode: ${mode}`);
} }
Container.bind(Manager).to(NNIManager).scope(Scope.Singleton); Container.bind(Manager).to(NNIManager).scope(Scope.Singleton);
...@@ -66,7 +70,7 @@ async function initContainer(platformMode: string): Promise<void> { ...@@ -66,7 +70,7 @@ async function initContainer(platformMode: string): Promise<void> {
} }
function usage(): void { function usage(): void {
console.info('usage: node main.js --port <port> --mode <local/remote/pai> --start_mode <new/resume> --experiment_id <id>'); console.info('usage: node main.js --port <port> --mode <local/remote/pai/kubeflow/frameworkcontroller> --start_mode <new/resume> --experiment_id <id>');
} }
const strPort: string = parseArg(['--port', '-p']); const strPort: string = parseArg(['--port', '-p']);
...@@ -78,7 +82,7 @@ if (!strPort || strPort.length === 0) { ...@@ -78,7 +82,7 @@ if (!strPort || strPort.length === 0) {
const port: number = parseInt(strPort, 10); const port: number = parseInt(strPort, 10);
const mode: string = parseArg(['--mode', '-m']); const mode: string = parseArg(['--mode', '-m']);
if (!['local', 'remote', 'pai', 'kubeflow'].includes(mode)) { if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'].includes(mode)) {
console.log(`FATAL: unknown mode: ${mode}`); console.log(`FATAL: unknown mode: ${mode}`);
usage(); usage();
process.exit(1); process.exit(1);
......
...@@ -68,6 +68,20 @@ export namespace ValidationSchemas { ...@@ -68,6 +68,20 @@ export namespace ValidationSchemas {
memoryMB: joi.number().min(100), memoryMB: joi.number().min(100),
gpuNum: joi.number().min(0).required(), gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required() command: joi.string().min(1).required()
}),
taskRoles: joi.array({
name: joi.string().min(1),
taskNum: joi.number().min(1).required(),
image: joi.string().min(1),
outputDir: joi.string(),
cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required(),
frameworkAttemptCompletionPolicy: joi.object({
minFailedTaskCount: joi.number(),
minSucceededTaskCount: joi.number()
})
}) })
}), }),
pai_config: joi.object({ pai_config: joi.object({
...@@ -92,6 +106,21 @@ export namespace ValidationSchemas { ...@@ -92,6 +106,21 @@ export namespace ValidationSchemas {
azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/)
}) })
}), }),
frameworkcontroller_config: joi.object({
storage: joi.string().min(1),
nfs: joi.object({
server: joi.string().min(1).required(),
path: joi.string().min(1).required()
}),
keyVault: joi.object({
vaultName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/),
name: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){1,127}$/)
}),
azureStorage: joi.object({
accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/),
azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/)
})
}),
nni_manager_ip: joi.object({ nni_manager_ip: joi.object({
nniManagerIp: joi.string().min(1) nniManagerIp: joi.string().min(1)
}) })
......
...@@ -30,5 +30,6 @@ export enum TrialConfigMetadataKey { ...@@ -30,5 +30,6 @@ export enum TrialConfigMetadataKey {
RANDOM_SCHEDULER = 'random_scheduler', RANDOM_SCHEDULER = 'random_scheduler',
PAI_CLUSTER_CONFIG = 'pai_config', PAI_CLUSTER_CONFIG = 'pai_config',
KUBEFLOW_CLUSTER_CONFIG = 'kubeflow_config', KUBEFLOW_CLUSTER_CONFIG = 'kubeflow_config',
NNI_MANAGER_IP = 'nni_manager_ip' NNI_MANAGER_IP = 'nni_manager_ip',
FRAMEWORKCONTROLLER_CLUSTER_CONFIG = 'frameworkcontroller_config'
} }
...@@ -195,6 +195,3 @@ export namespace AzureStorageClientUtility { ...@@ -195,6 +195,3 @@ export namespace AzureStorageClientUtility {
return deferred.promise; return deferred.promise;
} }
} }
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as fs from 'fs';
import { KubernetesCRDClient, GeneralK8sClient } from '../kubernetesApiClient';
/**
 * Abstract base for frameworkcontroller CRD clients. It only contributes the
 * static factory below; everything else comes from KubernetesCRDClient.
 */
abstract class FrameworkControllerClient extends KubernetesCRDClient {
    /**
     * Factory method to generate the frameworkcontroller operator client.
     * @returns a newly constructed FrameworkControllerClientV1
     */
    public static generateFrameworkControllerClient(): KubernetesCRDClient {
        const v1Client: KubernetesCRDClient = new FrameworkControllerClientV1();

        return v1Client;
    }
}
/**
 * Client for the `frameworkcontroller.microsoft.com` v1 `frameworks` resource.
 */
class FrameworkControllerClientV1 extends FrameworkControllerClient {
    /**
     * Reads the frameworkcontroller CRD definition from disk (path is relative to the
     * process working directory) and registers it with the underlying client.
     */
    public constructor() {
        super();
        const crdFileContent: string = fs.readFileSync('./config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json', 'utf8');
        this.crdSchema = JSON.parse(crdFileContent);
        this.client.addCustomResourceDefinition(this.crdSchema);
    }

    /**
     * Accessor for the `frameworks` resource of API group version v1 in the 'default' namespace.
     */
    protected get operator(): any {
        const frameworkControllerApis: any = this.client.apis['frameworkcontroller.microsoft.com'];

        return frameworkControllerApis.v1.namespaces('default').frameworks;
    }

    /**
     * Container name associated with this client; always 'framework'.
     */
    public get containerName(): string {
        const name: string = 'framework';

        return name;
    }
}
export { FrameworkControllerClient, GeneralK8sClient };
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { KubernetesTrialConfig, KubernetesTrialConfigTemplate } from '../kubernetesConfig'
/**
 * Immutable pair of task-count thresholds attached to a task role.
 * Both values are stored exactly as given; no validation is performed here.
 */
export class FrameworkAttemptCompletionPolicy {
    /**
     * @param minFailedTaskCount value exposed as `minFailedTaskCount`
     * @param minSucceededTaskCount value exposed as `minSucceededTaskCount`
     */
    constructor(
        public readonly minFailedTaskCount: number,
        public readonly minSucceededTaskCount: number
    ) {}
}
/**
 * Trial job configuration for a single FrameworkController task role
 */
export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfigTemplate {
    public readonly frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy;
    public readonly name: string;
    public readonly taskNum: number;

    /**
     * @param taskNum value stored as `taskNum`
     * @param command forwarded to the base template
     * @param gpuNum forwarded to the base template
     * @param cpuNum forwarded to the base template
     * @param memoryMB forwarded to the base template
     * @param image forwarded to the base template
     * @param frameworkAttemptCompletionPolicy completion policy stored on this task role
     * @param name task role name. Declared last, with a default, so existing
     *             seven-argument callers keep working. Previously the constructor had no
     *             such parameter and `this.name = name` read an unrelated global `name`.
     */
    constructor(taskNum: number, command: string, gpuNum: number,
                cpuNum: number, memoryMB: number, image: string,
                frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy,
                name: string = '') {
        super(command, gpuNum, cpuNum, memoryMB, image);
        this.frameworkAttemptCompletionPolicy = frameworkAttemptCompletionPolicy;
        this.name = name;
        this.taskNum = taskNum;
    }
}
/**
 * Trial configuration for frameworkcontroller: the code directory plus the
 * list of task role templates that make up one trial.
 */
export class FrameworkControllerTrialConfig extends KubernetesTrialConfig {
    /** Task role templates of the trial */
    public readonly taskRoles: FrameworkControllerTrialConfigTemplate[];
    /** Code directory; also handed to the base class constructor */
    public readonly codeDir: string;

    /**
     * @param codeDir code directory of the trial
     * @param taskRoles task role templates of the trial
     */
    constructor(
        codeDir: string,
        taskRoles: FrameworkControllerTrialConfigTemplate[]
    ) {
        super(codeDir);

        this.taskRoles = taskRoles;
        this.codeDir = codeDir;
    }
}
/**
 * Framework state values; the job info collector casts the `status.state`
 * field of a retrieved job object to this type.
 */
export type FrameworkControllerJobStatus = 'AttemptRunning' | 'Completed' | 'AttemptCreationPending' | 'AttemptCreationRequested' | 'AttemptPreparing' | 'AttemptCompleted';
/**
 * Completion type of a job whose state is 'Completed'; the job info collector casts
 * `status.attemptStatus.completionStatus.type.name` of the job object to this type.
 */
export type FrameworkControllerJobCompleteStatus = 'Succeeded' | 'Failed';
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesCRDClient } from '../kubernetesApiClient';
import { KubernetesJobInfoCollector } from '../kubernetesJobInfoCollector';
import { FrameworkControllerJobStatus, FrameworkControllerJobCompleteStatus } from './frameworkcontrollerConfig';
/**
 * Collects frameworkcontroller job info from the Kubernetes cluster and updates
 * the locally tracked frameworkcontroller job status
 */
export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollector {
    /**
     * @param jobMap map of trial jobs, handed to the base collector
     */
    constructor(jobMap: Map<string, KubernetesTrialJobDetail>) {
        super(jobMap);
    }

    /**
     * Fetch one job from the cluster and translate its framework state into the trial job status.
     *
     * Jobs whose current status is not in `statusesNeedToCheck` are skipped. A failure to
     * fetch the job is logged and otherwise ignored.
     *
     * @param kubernetesCRDClient client used to query the job; the promise is rejected when undefined
     * @param kubernetesTrialJob trial job detail, updated in place (status, startTime, endTime)
     */
    protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
                                               kubernetesTrialJob: KubernetesTrialJobDetail): Promise<void> {
        if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) {
            return Promise.resolve();
        }

        if (kubernetesCRDClient === undefined) {
            return Promise.reject('kubernetesCRDClient is undefined');
        }

        let kubernetesJobInfo: any;
        try {
            kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName);
        } catch (error) {
            this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`);

            // This is not treated as an error status
            return Promise.resolve();
        }

        if (kubernetesJobInfo.status && kubernetesJobInfo.status.state) {
            const frameworkJobType: FrameworkControllerJobStatus = <FrameworkControllerJobStatus>kubernetesJobInfo.status.state;
            switch (frameworkJobType) {
                // Each pending state needs its own label: `case 'A' || 'B' || 'C'` evaluates
                // to 'A' alone, so the other two states would never match.
                case 'AttemptCreationPending':
                case 'AttemptCreationRequested':
                case 'AttemptPreparing':
                    kubernetesTrialJob.status = 'WAITING';
                    break;
                case 'AttemptRunning':
                    kubernetesTrialJob.status = 'RUNNING';
                    // Only record the start time the first time the job is seen running
                    if (!kubernetesTrialJob.startTime) {
                        kubernetesTrialJob.startTime = Date.parse(<string>kubernetesJobInfo.status.startTime);
                    }
                    break;
                case 'Completed': {
                    const completedJobType: FrameworkControllerJobCompleteStatus =
                        <FrameworkControllerJobCompleteStatus>kubernetesJobInfo.status.attemptStatus.completionStatus.type.name;
                    switch (completedJobType) {
                        case 'Succeeded':
                            kubernetesTrialJob.status = 'SUCCEEDED';
                            break;
                        case 'Failed':
                            kubernetesTrialJob.status = 'FAILED';
                            break;
                        default:
                            break;
                    }
                    kubernetesTrialJob.endTime = Date.parse(<string>kubernetesJobInfo.status.completionTime);
                    break;
                }
                default:
                    break;
            }
        }

        return Promise.resolve();
    }
}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as component from '../../../common/component';
import { FrameworkControllerTrainingService } from './frameworkcontrollerTrainingService';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/**
 * Rest server of the frameworkcontroller training service; provides the rest API
 * that supports frameworkcontroller job metrics updates.
 */
@component.Singleton
export class FrameworkControllerJobRestServer extends KubernetesJobRestServer {
    /**
     * Looks up the FrameworkControllerTrainingService component and hands it to the base rest server.
     */
    public constructor() {
        super(
            component.get(FrameworkControllerTrainingService)
        );
    }
}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict'
import * as component from '../../../common/component';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import * as path from 'path';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
import { getExperimentId } from '../../../common/experimentStartupInfo';
import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import {
JobApplicationForm, TrialJobApplicationForm,
TrialJobDetail, NNIManagerIpConfig
} from '../../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils';
import { NFSConfig, KubernetesClusterConfigNFS, KubernetesClusterConfigAzure, KubernetesClusterConfigFactory } from '../kubernetesConfig'
import { KubernetesTrialJobDetail } from '../kubernetesData';
import { validateCodeDir } from '../../common/util';
import { AzureStorageClientUtility } from '../azureStorageClientUtils';
import { KubernetesTrainingService } from '../kubernetesTrainingService';
import { FrameworkControllerTrialConfig } from './frameworkcontrollerConfig';
import { FrameworkControllerJobRestServer } from './frameworkcontrollerJobRestServer';
import { FrameworkControllerClient } from './frameworkcontrollerApiClient';
import { FrameworkControllerJobInfoCollector } from './frameworkcontrollerJobInfoCollector';
/**
 * Training Service implementation for frameworkcontroller
 */
@component.Singleton
class FrameworkControllerTrainingService extends KubernetesTrainingService implements KubernetesTrainingService {
    // Trial configuration (codeDir and taskRoles); set when TRIAL_CONFIG metadata arrives
    private frameworkcontrollerTrialConfig?: FrameworkControllerTrialConfig;
    // Collector that refreshes the status of the jobs held in trialJobsMap
    private frameworkcontrollerJobInfoCollector: FrameworkControllerJobInfoCollector;

    constructor() {
        super();
        this.frameworkcontrollerJobInfoCollector = new FrameworkControllerJobInfoCollector(this.trialJobsMap);
        this.experimentId = getExperimentId();
        this.nextTrialSequenceId = -1;
    }

    /**
     * Start the rest server, then poll trial status every 3 seconds until `stopping` is set.
     */
    public async run(): Promise<void> {
        this.kubernetesJobRestServer = component.get(FrameworkControllerJobRestServer);
        if (!this.kubernetesJobRestServer) {
            throw new Error('kubernetesJobRestServer not initialized!');
        }
        await this.kubernetesJobRestServer.start();
        this.log.info(`frameworkcontroller Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`);
        while (!this.stopping) {
            // collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
            await delay(3000);
            await this.frameworkcontrollerJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient);
        }
    }

    /**
     * Prepare the run scripts, upload the code files and create the frameworkcontroller job for one trial.
     * @param form job application form of the trial
     * @returns detail of the submitted trial job (initial status 'WAITING')
     */
    public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> {
        if (!this.kubernetesClusterConfig) {
            throw new Error('frameworkcontrollerClusterConfig is not initialized');
        }
        if (!this.kubernetesCRDClient) {
            throw new Error('kubernetesCRDClient is undefined');
        }
        if (!this.kubernetesRestServerPort) {
            const restServer: FrameworkControllerJobRestServer = component.get(FrameworkControllerJobRestServer);
            this.kubernetesRestServerPort = restServer.clusterRestServerPort;
        }

        const trialJobId: string = uniqueString(5);
        const curTrialSequenceId: number = this.generateSequenceId();
        // Set trial's NFS working folder
        const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId);
        const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
        const frameworkcontrollerJobName: string = `nniexp${this.experimentId}trial${trialJobId}`.toLowerCase();

        await this.prepareRunScript(trialLocalTempFolder, curTrialSequenceId, trialJobId, trialWorkingFolder, form);
        // Upload code files
        const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder);

        const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail(
            trialJobId,
            'WAITING',
            Date.now(),
            trialWorkingFolder,
            form,
            frameworkcontrollerJobName,
            curTrialSequenceId,
            trialJobOutputUrl
        );

        // Create frameworkcontroller job based on generated frameworkcontroller job resource config
        const frameworkcontrollerJobConfig: any = await this.prepareFrameworkControllerConfig(
            trialJobId, trialWorkingFolder, frameworkcontrollerJobName);
        await this.kubernetesCRDClient.createKubernetesJob(frameworkcontrollerJobConfig);

        // Register the trial only after the job has been created, so a failed creation
        // does not leave a 'WAITING' entry in the map with no job behind it.
        this.trialJobsMap.set(trialJobId, trialJobDetail);

        return trialJobDetail;
    }

    /**
     * Upload code files to NFS or Azure storage
     * @param trialJobId trial job id
     * @param trialLocalTempFolder local folder that holds the trial's files
     * @returns trialJobOutputUrl; empty string when the storage type is neither 'azureStorage' nor 'nfs'
     */
    private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise<string> {
        if (!this.kubernetesClusterConfig) {
            throw new Error('frameworkcontroller Cluster config is not initialized');
        }

        let trialJobOutputUrl: string = '';

        if (this.kubernetesClusterConfig.storageType === 'azureStorage') {
            try {
                // Upload local files to azure storage
                await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
                    `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`);

                trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
            } catch (error) {
                this.log.error(error);

                return Promise.reject(error);
            }
        } else if (this.kubernetesClusterConfig.storageType === 'nfs') {
            const nfsFrameworkControllerClusterConfig: KubernetesClusterConfigNFS = <KubernetesClusterConfigNFS>this.kubernetesClusterConfig;
            // Create work dir for current trial in NFS directory
            await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`);
            // Copy code files from local dir to NFS mounted dir
            await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);

            const nfsConfig: NFSConfig = nfsFrameworkControllerClusterConfig.nfs;
            trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`;
        }

        return trialJobOutputUrl;
    }

    /**
     * Populate the trial's local temp folder: a copy of the code directory, install_nni.sh,
     * one run_<taskRole name>.sh per task role and, when present, the hyper-parameter file.
     */
    private async prepareRunScript(trialLocalTempFolder: string, curTrialSequenceId: number, trialJobId: string,
                                   trialWorkingFolder: string, form: JobApplicationForm): Promise<void> {
        if (!this.frameworkcontrollerTrialConfig) {
            throw new Error('frameworkcontroller trial config is not initialized');
        }

        // The copy must run before the `mkdir -p` of the temp folder below: `cp -r src dst`
        // produces a different layout when `dst` already exists.
        await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
        await cpp.exec(`cp -r ${this.frameworkcontrollerTrialConfig.codeDir} ${trialLocalTempFolder}`);

        // Write NNI installation file to local tmp files
        const installScriptContent: string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
        await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), installScriptContent, { encoding: 'utf8' });

        // Create tmp trial working folder locally.
        await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);

        for (const taskRole of this.frameworkcontrollerTrialConfig.taskRoles) {
            const taskRunScriptContent: string = this.generateRunScript('frameworkcontroller', trialJobId, trialWorkingFolder,
                taskRole.command, curTrialSequenceId.toString(), taskRole.name, taskRole.gpuNum);
            await fs.promises.writeFile(path.join(trialLocalTempFolder, `run_${taskRole.name}.sh`), taskRunScriptContent, { encoding: 'utf8' });
        }

        // Write file content ( parameter.cfg ) to local tmp folders
        const trialForm: TrialJobApplicationForm = (<TrialJobApplicationForm>form);
        if (trialForm && trialForm.hyperParameters) {
            await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)),
                trialForm.hyperParameters.value, { encoding: 'utf8' });
        }
    }

    /**
     * Build the frameworkcontroller job resource config for one trial.
     * The resource limits of every task role are set equal to its requests.
     */
    private async prepareFrameworkControllerConfig(trialJobId: string, trialWorkingFolder: string,
                                                   frameworkcontrollerJobName: string): Promise<any> {
        if (!this.frameworkcontrollerTrialConfig) {
            throw new Error('frameworkcontroller trial config is not initialized');
        }

        const podResources: any = [];
        for (const taskRole of this.frameworkcontrollerTrialConfig.taskRoles) {
            const resource: any = {};
            resource.requests = this.generatePodResource(taskRole.memoryMB, taskRole.cpuNum, taskRole.gpuNum);
            resource.limits = Object.assign({}, resource.requests);
            podResources.push(resource);
        }

        // Generate frameworkcontroller job resource config object
        return this.generateFrameworkControllerJobConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName, podResources);
    }

    /**
     * Apply one piece of cluster metadata. Handles the NNI manager IP, the frameworkcontroller
     * cluster config (which also sets up storage and the CRD client) and the trial config;
     * any other key is ignored.
     * @param key metadata key
     * @param value JSON-encoded metadata value
     */
    public async setClusterMetadata(key: string, value: string): Promise<void> {
        switch (key) {
            case TrialConfigMetadataKey.NNI_MANAGER_IP:
                this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
                break;

            case TrialConfigMetadataKey.FRAMEWORKCONTROLLER_CLUSTER_CONFIG: {
                const frameworkcontrollerClusterJsonObject: any = JSON.parse(value);
                this.kubernetesClusterConfig = KubernetesClusterConfigFactory.generateKubernetesClusterConfig(frameworkcontrollerClusterJsonObject);
                if (this.kubernetesClusterConfig.storageType === 'azureStorage') {
                    const azureFrameworkControllerClusterConfig = <KubernetesClusterConfigAzure>this.kubernetesClusterConfig;
                    this.azureStorageAccountName = azureFrameworkControllerClusterConfig.azureStorage.accountName;
                    this.azureStorageShare = azureFrameworkControllerClusterConfig.azureStorage.azureShare;
                    await this.createAzureStorage(
                        azureFrameworkControllerClusterConfig.keyVault.vaultName,
                        azureFrameworkControllerClusterConfig.keyVault.name,
                        azureFrameworkControllerClusterConfig.azureStorage.accountName,
                        azureFrameworkControllerClusterConfig.azureStorage.azureShare
                    );
                } else if (this.kubernetesClusterConfig.storageType === 'nfs') {
                    const nfsFrameworkControllerClusterConfig = <KubernetesClusterConfigNFS>this.kubernetesClusterConfig;
                    await this.createNFSStorage(
                        nfsFrameworkControllerClusterConfig.nfs.server,
                        nfsFrameworkControllerClusterConfig.nfs.path
                    );
                }
                this.kubernetesCRDClient = FrameworkControllerClient.generateFrameworkControllerClient();
                break;
            }

            case TrialConfigMetadataKey.TRIAL_CONFIG: {
                const frameworkcontrollerTrialJsonObject: any = JSON.parse(value);
                this.frameworkcontrollerTrialConfig = new FrameworkControllerTrialConfig(
                    frameworkcontrollerTrialJsonObject.codeDir,
                    frameworkcontrollerTrialJsonObject.taskRoles
                );

                // Validate to make sure codeDir doesn't have too many files
                try {
                    await validateCodeDir(this.frameworkcontrollerTrialConfig.codeDir);
                } catch (error) {
                    this.log.error(error);

                    return Promise.reject(new Error(error));
                }
                break;
            }

            default:
                break;
        }

        return Promise.resolve();
    }

    /**
     * Generate frameworkcontroller resource config file
     * @param trialJobId trial job id
     * @param trialWorkingFolder working folder
     * @param frameworkcontrollerJobName job name
     * @param podResources pod resources, one entry per task role, in task role order
     */
    private generateFrameworkControllerJobConfig(trialJobId: string, trialWorkingFolder: string,
                                                 frameworkcontrollerJobName: string, podResources: any): any {
        if (!this.kubernetesClusterConfig) {
            throw new Error('frameworkcontroller Cluster config is not initialized');
        }
        if (!this.frameworkcontrollerTrialConfig) {
            throw new Error('frameworkcontroller trial config is not initialized');
        }

        const taskRoles: any[] = this.frameworkcontrollerTrialConfig.taskRoles.map(
            (roleConfig: FrameworkControllerTrialConfig['taskRoles'][number], index: number): any => ({
                name: roleConfig.name,
                taskNumber: roleConfig.taskNum,
                frameworkAttemptCompletionPolicy: {
                    minFailedTaskCount: roleConfig.frameworkAttemptCompletionPolicy.minFailedTaskCount,
                    minSucceededTaskCount: roleConfig.frameworkAttemptCompletionPolicy.minSucceededTaskCount
                },
                task: this.generateTaskRoleConfig(
                    trialWorkingFolder,
                    roleConfig.image,
                    `run_${roleConfig.name}.sh`,
                    podResources[index]
                )
            })
        );

        return {
            apiVersion: `frameworkcontroller.microsoft.com/v1`,
            kind: 'Framework',
            metadata: {
                name: frameworkcontrollerJobName,
                namespace: 'default',
                labels: {
                    app: this.NNI_KUBERNETES_TRIAL_LABEL,
                    expId: getExperimentId(),
                    trialId: trialJobId
                }
            },
            spec: {
                executionType: 'Start',
                taskRoles: taskRoles
            }
        };
    }

    /**
     * Generate the task (pod template) section of one task role.
     * @param trialWorkingFolder working folder that contains the run script
     * @param replicaImage container image
     * @param runScriptFile file name of the run script inside the working folder
     * @param podResources resources section of the container
     */
    private generateTaskRoleConfig(trialWorkingFolder: string, replicaImage: string, runScriptFile: string, podResources: any): any {
        if (!this.kubernetesClusterConfig) {
            throw new Error('frameworkcontroller Cluster config is not initialized');
        }
        if (!this.frameworkcontrollerTrialConfig) {
            throw new Error('frameworkcontroller trial config is not initialized');
        }

        // Any storage type other than 'azureStorage' is mounted as NFS
        let nniVolumes: object[];
        if (this.kubernetesClusterConfig.storageType === 'azureStorage') {
            nniVolumes = [
                {
                    name: 'nni-vol',
                    azureFile: {
                        secretName: `${this.azureStorageSecretName}`,
                        shareName: `${this.azureStorageShare}`,
                        readonly: false
                    }
                }];
        } else {
            const frameworkcontrollerClusterConfigNFS: KubernetesClusterConfigNFS = <KubernetesClusterConfigNFS>this.kubernetesClusterConfig;
            nniVolumes = [
                {
                    name: 'nni-vol',
                    nfs: {
                        server: `${frameworkcontrollerClusterConfigNFS.nfs.server}`,
                        path: `${frameworkcontrollerClusterConfigNFS.nfs.path}`
                    }
                }];
        }

        return {
            pod: {
                spec: {
                    containers: [
                        {
                            name: 'framework',
                            image: replicaImage,
                            args: ['sh', `${path.join(trialWorkingFolder, runScriptFile)}`],
                            volumeMounts: [
                                {
                                    name: 'nni-vol',
                                    mountPath: this.CONTAINER_MOUNT_PATH
                                }],
                            resources: podResources
                        }],
                    restartPolicy: 'OnFailure',
                    volumes: nniVolumes
                }
            }
        };
    }
}
export { FrameworkControllerTrainingService }
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as fs from 'fs';
import { KubeflowOperator } from './kubeflowConfig';
import { KubernetesCRDClient, GeneralK8sClient } from '../kubernetesApiClient';
abstract class KubeflowOperatorClient extends KubernetesCRDClient{
    /**
     * Factory method to generate the operator client for a Kubeflow operator and API version.
     *
     * @param kubeflowOperator operator kind: 'tf-operator' or 'pytorch-operator'
     * @param operatorApiVersion operator API version: 'v1alpha2' or 'v1beta1'
     * @returns a newly constructed client for the requested operator/version pair
     * @throws Error when the operator/version pair is not one of the four supported combinations
     */
    public static generateOperatorClient(kubeflowOperator: KubeflowOperator,
                                         operatorApiVersion: string): KubernetesCRDClient {
        // Strict equality throughout: both operands are strings, so no coercion is wanted.
        if (kubeflowOperator === 'tf-operator') {
            if (operatorApiVersion === 'v1alpha2') {
                return new TFOperatorClientV1Alpha2();
            }
            if (operatorApiVersion === 'v1beta1') {
                return new TFOperatorClientV1Beta1();
            }
        } else if (kubeflowOperator === 'pytorch-operator') {
            if (operatorApiVersion === 'v1alpha2') {
                return new PytorchOperatorClientV1Alpha2();
            }
            if (operatorApiVersion === 'v1beta1') {
                return new PytorchOperatorClientV1Beta1();
            }
        }

        // Reached for an unknown operator as well as for a known operator with an unknown version.
        throw new Error(`Invalid operator ${kubeflowOperator} or apiVersion ${operatorApiVersion}`);
    }
}
class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
    /**
     * Reads the tfjob v1alpha2 CRD definition from ./config/kubeflow and adds it to the client.
     */
    public constructor() {
        super();
        const schemaText: string = fs.readFileSync('./config/kubeflow/tfjob-crd-v1alpha2.json', 'utf8');
        this.crdSchema = JSON.parse(schemaText);
        this.client.addCustomResourceDefinition(this.crdSchema);
    }

    /** Container name reported for this operator. */
    public get containerName(): string {
        return 'tensorflow';
    }

    /** The tfjobs handle under kubeflow.org/v1alpha2 for the 'default' namespace. */
    protected get operator(): any {
        const kubeflowGroup: any = this.client.apis["kubeflow.org"];

        return kubeflowGroup.v1alpha2.namespaces('default').tfjobs;
    }
}
class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
    /**
     * Reads the tfjob v1beta1 CRD definition from ./config/kubeflow and adds it to the client.
     */
    public constructor() {
        super();
        const schemaText: string = fs.readFileSync('./config/kubeflow/tfjob-crd-v1beta1.json', 'utf8');
        this.crdSchema = JSON.parse(schemaText);
        this.client.addCustomResourceDefinition(this.crdSchema);
    }

    /** Container name reported for this operator. */
    public get containerName(): string {
        return 'tensorflow';
    }

    /** The tfjobs handle under kubeflow.org/v1beta1 for the 'default' namespace. */
    protected get operator(): any {
        const kubeflowGroup: any = this.client.apis["kubeflow.org"];

        return kubeflowGroup.v1beta1.namespaces('default').tfjobs;
    }
}
class PytorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
    /**
     * Reads the pytorchjob v1alpha2 CRD definition from ./config/kubeflow and adds it to the client.
     */
    public constructor() {
        super();
        const schemaText: string = fs.readFileSync('./config/kubeflow/pytorchjob-crd-v1alpha2.json', 'utf8');
        this.crdSchema = JSON.parse(schemaText);
        this.client.addCustomResourceDefinition(this.crdSchema);
    }

    /** Container name reported for this operator. */
    public get containerName(): string {
        return 'pytorch';
    }

    /** The pytorchjobs handle under kubeflow.org/v1alpha2 for the 'default' namespace. */
    protected get operator(): any {
        const kubeflowGroup: any = this.client.apis["kubeflow.org"];

        return kubeflowGroup.v1alpha2.namespaces('default').pytorchjobs;
    }
}
class PytorchOperatorClientV1Beta1 extends KubernetesCRDClient {
    /**
     * Reads the pytorchjob v1beta1 CRD definition from ./config/kubeflow and adds it to the client.
     */
    public constructor() {
        super();
        const schemaText: string = fs.readFileSync('./config/kubeflow/pytorchjob-crd-v1beta1.json', 'utf8');
        this.crdSchema = JSON.parse(schemaText);
        this.client.addCustomResourceDefinition(this.crdSchema);
    }

    /** Container name reported for this operator. */
    public get containerName(): string {
        return 'pytorch';
    }

    /** The pytorchjobs handle under kubeflow.org/v1beta1 for the 'default' namespace. */
    protected get operator(): any {
        const kubeflowGroup: any = this.client.apis["kubeflow.org"];

        return kubeflowGroup.v1beta1.namespaces('default').pytorchjobs;
    }
}
export { KubeflowOperatorClient, GeneralK8sClient };
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as assert from 'assert';
import { KubernetesClusterConfigAzure, KubernetesClusterConfigNFS, KubernetesStorageKind, NFSConfig, AzureStorage, keyVaultConfig,
KubernetesTrialConfig, KubernetesTrialConfigTemplate, StorageConfig, KubernetesClusterConfig } from '../kubernetesConfig'
import { MethodNotImplementedError } from '../../../common/errors';
/** Operator types supported by kubeflow */
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' ;
/** Role names of a distributed training job. NOTE(review): presumably the replica roles of a trial - confirm against callers */
export type DistTrainRole = 'worker' | 'ps' | 'master';
/** Status names of a kubeflow job. NOTE(review): presumably the `type` of a job status condition - confirm */
export type KubeflowJobStatus = 'Created' | 'Running' | 'Failed' | 'Succeeded';
/** API versions of the kubeflow operators */
export type OperatorApiVersion = 'v1alpha2' | 'v1beta1';
/**
 * Kubernetes cluster config extended with the kubeflow operator to use.
 */
export class KubeflowClusterConfig extends KubernetesClusterConfig {
    /**
     * @param codeDir forwarded unchanged to the KubernetesClusterConfig constructor
     * @param operator kubeflow operator, stored as the read-only `operator` property
     */
    constructor(codeDir: string, public readonly operator: KubeflowOperator) {
        super(codeDir);
    }
}
/**
 * NFS-backed Kubernetes cluster config extended with the kubeflow operator to use.
 */
export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
    /**
     * Builds a KubeflowClusterConfigNFS from an object carrying the
     * `operator`, `apiVersion`, `nfs` and `storage` fields.
     *
     * @param jsonObject object to read the fields from
     */
    public static getInstance(jsonObject: object): KubeflowClusterConfigNFS {
        const parsed = <KubeflowClusterConfigNFS>jsonObject;
        assert (parsed !== undefined)
        const { operator, apiVersion, nfs, storage } = parsed;

        return new KubeflowClusterConfigNFS(operator, apiVersion, nfs, storage);
    }

    /**
     * @param operator kubeflow operator, stored as the read-only `operator` property
     * @param apiVersion forwarded to the KubernetesClusterConfigNFS constructor
     * @param nfs forwarded to the KubernetesClusterConfigNFS constructor
     * @param storage optional storage kind, forwarded to the KubernetesClusterConfigNFS constructor
     */
    constructor(
        public readonly operator: KubeflowOperator,
        apiVersion: string,
        nfs: NFSConfig,
        storage?: KubernetesStorageKind
    ) {
        super(apiVersion, nfs, storage);
    }

    /** Always 'nfs' for this config class. */
    public get storageType(): KubernetesStorageKind {
        return 'nfs';
    }
}
/**
 * Azure-storage-backed Kubernetes cluster config extended with the kubeflow operator to use.
 */
export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure{
    /**
     * Builds a KubeflowClusterConfigAzure from an object carrying the
     * `operator`, `apiVersion`, `keyVault`, `azureStorage` and `storage` fields.
     *
     * @param jsonObject object to read the fields from
     */
    public static getInstance(jsonObject: object): KubeflowClusterConfigAzure {
        const { operator, apiVersion, keyVault, azureStorage, storage } = <KubeflowClusterConfigAzure>jsonObject;

        return new KubeflowClusterConfigAzure(operator, apiVersion, keyVault, azureStorage, storage);
    }

    /**
     * @param operator kubeflow operator, stored as the read-only `operator` property
     * @param apiVersion forwarded to the KubernetesClusterConfigAzure constructor
     * @param keyVault forwarded to the KubernetesClusterConfigAzure constructor
     * @param azureStorage forwarded to the KubernetesClusterConfigAzure constructor
     * @param storage optional storage kind, forwarded to the KubernetesClusterConfigAzure constructor
     */
    constructor(
        public readonly operator: KubeflowOperator,
        apiVersion: string,
        keyVault: keyVaultConfig,
        azureStorage: AzureStorage,
        storage?: KubernetesStorageKind
    ) {
        super(apiVersion, keyVault, azureStorage, storage);
    }

    /** Always 'azureStorage' for this config class. */
    public get storageType(): KubernetesStorageKind {
        return 'azureStorage';
    }
}
/**
 * Factory that picks the cluster config class matching the `storage` field of a config object.
 */
export class KubeflowClusterConfigFactory {
    /**
     * Generates the kubeflow cluster config for the given object.
     *
     * @param jsonObject config object; its `storage` field selects the config class
     * @returns a KubeflowClusterConfigAzure when storage is 'azureStorage',
     *          a KubeflowClusterConfigNFS when storage is 'nfs' or undefined
     * @throws Error when jsonObject is falsy or `storage` holds any other value
     */
    public static generateKubeflowClusterConfig(jsonObject: object): KubeflowClusterConfig {
        const storageConfig = <StorageConfig>jsonObject;
        if (!storageConfig) {
            throw new Error("Invalid json object as a StorageConfig instance");
        }

        if (storageConfig.storage === 'azureStorage') {
            return KubeflowClusterConfigAzure.getInstance(jsonObject);
        }
        // NFS is the default when no storage kind is given
        if (storageConfig.storage === undefined || storageConfig.storage === 'nfs') {
            return KubeflowClusterConfigNFS.getInstance(jsonObject);
        }

        // Interpolating the object itself would only print "[object Object]";
        // report the offending storage value instead.
        throw new Error(`Invalid json object: unsupported storage ${JSON.stringify(storageConfig.storage)}`);
    }
}
/**
 * Base trial config for kubeflow; the operator-specific subclasses supply `operatorType`.
 */
export class KubeflowTrialConfig extends KubernetesTrialConfig {
    /**
     * @param codeDir forwarded unchanged to the KubernetesTrialConfig constructor
     */
    constructor(codeDir: string) {
        super(codeDir);
    }

    /**
     * Not implemented on the base class.
     *
     * @throws MethodNotImplementedError always
     */
    public get operatorType(): KubeflowOperator {
        throw new MethodNotImplementedError();
    }
}
/**
 * Kubernetes trial config template extended with a replica count.
 */
export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate{
    /**
     * @param replicas stored as the read-only `replicas` property
     * @param command forwarded to the KubernetesTrialConfigTemplate constructor
     * @param gpuNum forwarded to the KubernetesTrialConfigTemplate constructor
     * @param cpuNum forwarded to the KubernetesTrialConfigTemplate constructor
     * @param memoryMB forwarded to the KubernetesTrialConfigTemplate constructor
     * @param image forwarded to the KubernetesTrialConfigTemplate constructor
     */
    constructor(
        public readonly replicas: number,
        command : string,
        gpuNum : number,
        cpuNum: number,
        memoryMB: number,
        image: string
    ) {
        super(command, gpuNum, cpuNum, memoryMB, image);
    }
}
/**
 * Trial config for the tf-operator: a required worker template and an optional ps template.
 */
export class KubeflowTrialConfigTensorflow extends KubeflowTrialConfig {
    public readonly ps?: KubeflowTrialConfigTemplate;
    public readonly worker: KubeflowTrialConfigTemplate;

    /**
     * @param codeDir forwarded to the KubeflowTrialConfig constructor
     * @param worker template stored as `worker`
     * @param ps optional template stored as `ps`
     */
    constructor(
        codeDir: string,
        worker: KubeflowTrialConfigTemplate,
        ps?: KubeflowTrialConfigTemplate
    ) {
        super(codeDir);
        // Assignment order (ps, then worker) is kept so property order on the instance is unchanged.
        this.ps = ps;
        this.worker = worker;
    }

    /** Always 'tf-operator' for this config class. */
    public get operatorType(): KubeflowOperator {
        return 'tf-operator';
    }
}
/**
 * Trial config for the pytorch-operator: a required master template and an optional worker template.
 */
export class KubeflowTrialConfigPytorch extends KubeflowTrialConfig {
    /**
     * @param codeDir forwarded to the KubeflowTrialConfig constructor
     * @param master template stored as the read-only `master` property
     * @param worker optional template stored as the read-only `worker` property
     */
    constructor(
        codeDir: string,
        public readonly master: KubeflowTrialConfigTemplate,
        public readonly worker?: KubeflowTrialConfigTemplate
    ) {
        super(codeDir);
    }

    /** Always 'pytorch-operator' for this config class. */
    public get operatorType(): KubeflowOperator {
        return 'pytorch-operator';
    }
}
/**
 * Factory that builds the trial config class matching a kubeflow operator.
 */
export class KubeflowTrialConfigFactory {
    /**
     * Generates the kubeflow trial config for the given operator.
     *
     * @param jsonObject config object; `codeDir` plus `worker`/`ps` (tf-operator)
     *                   or `master`/`worker` (pytorch-operator) are read from it
     * @param operator kubeflow operator that selects the config class
     * @returns a KubeflowTrialConfigTensorflow or KubeflowTrialConfigPytorch
     * @throws Error when the operator is neither 'tf-operator' nor 'pytorch-operator'
     */
    public static generateKubeflowTrialConfig(jsonObject: object, operator: KubeflowOperator): KubeflowTrialConfig {
        switch (operator) {
            case 'tf-operator': {
                const tensorflowConfig = <KubeflowTrialConfigTensorflow>jsonObject;

                return new KubeflowTrialConfigTensorflow(
                    tensorflowConfig.codeDir,
                    tensorflowConfig.worker,
                    tensorflowConfig.ps
                );
            }
            case 'pytorch-operator': {
                const pytorchConfig = <KubeflowTrialConfigPytorch>jsonObject;

                return new KubeflowTrialConfigPytorch(
                    pytorchConfig.codeDir,
                    pytorchConfig.master,
                    pytorchConfig.worker
                );
            }
            default:
                // Only an unsupported operator reaches this point. Interpolating the object
                // itself would print "[object Object]", so name the operator instead.
                throw new Error(`Invalid json object: unsupported operator ${JSON.stringify(operator)}`);
        }
    }
}
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesCRDClient } from '../kubernetesApiClient';
import { KubernetesJobInfoCollector } from '../kubernetesJobInfoCollector';
import { KubeflowJobStatus } from './kubeflowConfig';
/**
 * Collects Kubeflow job info from the Kubernetes cluster and updates the kubeflow job status locally
 */
export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
    constructor(jobMap: Map<string, KubernetesTrialJobDetail>) {
        super(jobMap);
    }

    /**
     * Fetches one job through the CRD client and copies its latest status condition
     * onto the given trial job (status plus startTime or endTime).
     *
     * Nothing is done when the trial's current status is not in `statusesNeedToCheck`,
     * when fetching the job fails (the failure is only logged), or when the job
     * reports no status conditions yet.
     *
     * @param kubernetesCRDClient client used to fetch the job
     * @param kubernetesTrialJob trial job detail that is updated in place
     * @throws Error when kubernetesCRDClient is undefined
     */
    protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
                                               kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> {
        if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) {
            return;
        }

        if (kubernetesCRDClient === undefined) {
            // Reject with an Error rather than a bare string so the failure carries a stack trace
            throw new Error('kubernetesCRDClient is undefined');
        }

        let kubernetesJobInfo: any;
        try {
            kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName);
        } catch (error) {
            this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`);

            // This is not treated as an error status
            return;
        }

        const conditions: any = kubernetesJobInfo.status && kubernetesJobInfo.status.conditions;
        // An empty (or non-array) conditions value has no latest entry; indexing it would
        // yield undefined and crash on `.type`, so leave the trial job untouched.
        if (!Array.isArray(conditions) || conditions.length === 0) {
            return;
        }

        const latestCondition: any = conditions[conditions.length - 1];
        if (!latestCondition) {
            return;
        }

        const tfJobType : KubeflowJobStatus = <KubeflowJobStatus>latestCondition.type;
        switch (tfJobType) {
            case 'Created':
                kubernetesTrialJob.status = 'WAITING';
                kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
                break;
            case 'Running':
                kubernetesTrialJob.status = 'RUNNING';
                // Keep a start time that was already recorded
                if (!kubernetesTrialJob.startTime) {
                    kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
                }
                break;
            case 'Failed':
                kubernetesTrialJob.status = 'FAILED';
                kubernetesTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
                break;
            case 'Succeeded':
                kubernetesTrialJob.status = 'SUCCEEDED';
                kubernetesTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
                break;
            default:
                // Any other condition type leaves the trial job unchanged
                break;
        }
    }
}
\ No newline at end of file
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as component from '../../../common/component';
import { KubeflowTrainingService } from './kubeflowTrainingService';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/**
* Rest server of the Kubeflow training service; provides the rest API that supports
* kubeflow job metrics updates.
* Registered through the component.Singleton decorator.
*/
@component.Singleton
export class KubeflowJobRestServer extends KubernetesJobRestServer{
/**
* Constructor to provide NNIRestServer's own rest property, e.g. port.
* Passes the KubeflowTrainingService obtained from component.get to the
* KubernetesJobRestServer constructor.
*/
constructor() {
super(component.get(KubeflowTrainingService));
}
}
\ No newline at end of file
...@@ -20,103 +20,150 @@ ...@@ -20,103 +20,150 @@
'use strict' 'use strict'
import * as assert from 'assert'; import * as assert from 'assert';
import * as azureStorage from 'azure-storage'; import * as component from '../../../common/component';
import * as component from '../../common/component';
import * as cpp from 'child-process-promise'; import * as cpp from 'child-process-promise';
import * as fs from 'fs'; import * as fs from 'fs';
import * as path from 'path'; import * as path from 'path';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
import { EventEmitter } from 'events'; import { getExperimentId } from '../../../common/experimentStartupInfo';
import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo'; import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import { getLogger, Logger } from '../../common/log';
import { MethodNotImplementedError } from '../../common/errors';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { import {
JobApplicationForm, TrainingService, TrialJobApplicationForm, JobApplicationForm, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric, NNIManagerIpConfig TrialJobDetail, NNIManagerIpConfig
} from '../../common/trainingService'; } from '../../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, uniqueString, getJobCancelStatus } from '../../common/utils'; import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils';
import { DistTrainRole, KubeflowClusterConfigBase, KubeflowClusterConfigNFS, KubeflowClusterConfigAzure, KubeflowTrialConfigBase, import { KubeflowClusterConfigNFS, KubeflowClusterConfigAzure,
KubeflowTrialConfigPytorch, KubeflowTrialConfigTensorflow, NFSConfig } from './kubeflowConfig'; KubeflowTrialConfigPytorch, KubeflowTrialConfigTensorflow, KubeflowClusterConfigFactory, KubeflowTrialConfigFactory,
import { KubeflowTrialJobDetail } from './kubeflowData'; KubeflowTrialConfig, KubeflowClusterConfig } from './kubeflowConfig';
import { NFSConfig } from '../kubernetesConfig'
import { KubernetesTrialJobDetail } from '../kubernetesData';
import { KubeflowJobRestServer } from './kubeflowJobRestServer'; import { KubeflowJobRestServer } from './kubeflowJobRestServer';
import { validateCodeDir } from '../../common/util';
import { AzureStorageClientUtility } from '../azureStorageClientUtils';
import { KubeflowOperatorClient } from './kubeflowApiClient';
import { KubernetesTrainingService } from '../kubernetesTrainingService'
import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector'; import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
import { validateCodeDir } from '../common/util';
import { AzureStorageClientUtility } from './azureStorageClientUtils';
import { GeneralK8sClient, KubeflowOperatorClient } from './kubernetesApiClient';
var azure = require('azure-storage');
var base64 = require('js-base64').Base64;
/** /**
* Training Service implementation for Kubeflow * Training Service implementation for Kubeflow
* Refer https://github.com/kubeflow/kubeflow for more info about Kubeflow * Refer https://github.com/kubeflow/kubeflow for more info about Kubeflow
*/ */
@component.Singleton @component.Singleton
class KubeflowTrainingService implements TrainingService { class KubeflowTrainingService extends KubernetesTrainingService implements KubernetesTrainingService {
private readonly NNI_KUBEFLOW_TRIAL_LABEL: string = 'nni-kubeflow-trial'; private kubeflowClusterConfig?: KubeflowClusterConfig;
private readonly log!: Logger; private kubeflowTrialConfig?: KubeflowTrialConfig;
private readonly metricsEmitter: EventEmitter;
private readonly trialJobsMap: Map<string, KubeflowTrialJobDetail>;
/** experiment root dir in NFS */
private readonly trialLocalNFSTempFolder: string;
private stopping: boolean = false;
private experimentId! : string;
private nextTrialSequenceId: number;
private kubeflowClusterConfig?: KubeflowClusterConfigBase;
private kubeflowTrialConfig?: KubeflowTrialConfigBase;
private kubeflowJobInfoCollector: KubeflowJobInfoCollector; private kubeflowJobInfoCollector: KubeflowJobInfoCollector;
private kubeflowRestServerPort?: number;
private operatorClient?: KubeflowOperatorClient;
private readonly genericK8sClient: GeneralK8sClient;
private readonly CONTAINER_MOUNT_PATH: string;
private azureStorageClient?: azureStorage.FileService;
private azureStorageShare?: string;
private azureStorageSecretName?: string;
private azureStorageAccountName?: string;
private nniManagerIpConfig?: NNIManagerIpConfig;
constructor() { constructor() {
this.log = getLogger(); super();
this.metricsEmitter = new EventEmitter();
this.trialJobsMap = new Map<string, KubeflowTrialJobDetail>();
this.genericK8sClient = new GeneralK8sClient();
this.kubeflowJobInfoCollector = new KubeflowJobInfoCollector(this.trialJobsMap); this.kubeflowJobInfoCollector = new KubeflowJobInfoCollector(this.trialJobsMap);
this.trialLocalNFSTempFolder = path.join(getExperimentRootDir(), 'trials-nfs-tmp');
this.experimentId = getExperimentId(); this.experimentId = getExperimentId();
this.nextTrialSequenceId = -1; this.nextTrialSequenceId = -1;
this.CONTAINER_MOUNT_PATH = '/tmp/mount';
} }
public async run(): Promise<void> { public async run(): Promise<void> {
const restServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer); this.kubernetesJobRestServer = component.get(KubeflowJobRestServer);
await restServer.start(); if(!this.kubernetesJobRestServer) {
this.log.info(`Kubeflow Training service rest server listening on: ${restServer.endPoint}`); throw new Error('kubernetesJobRestServer not initialized!');
}
await this.kubernetesJobRestServer.start();
this.log.info(`Kubeflow Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`);
while (!this.stopping) { while (!this.stopping) {
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server // collect metrics for Kubeflow jobs by interacting with Kubernetes API server
await delay(3000); await delay(3000);
await this.kubeflowJobInfoCollector.retrieveTrialStatus(this.operatorClient); await this.kubeflowJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient);
} }
} }
public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> { public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> {
if(!this.kubernetesCRDClient) {
throw new Error('Kubeflow job operator client is undefined');
}
if(!this.kubernetesRestServerPort) {
const restServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer);
this.kubernetesRestServerPort = restServer.clusterRestServerPort;
}
const trialJobId: string = uniqueString(5);
const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId);
const kubeflowJobName = `nni-exp-${this.experimentId}-trial-${trialJobId}`.toLowerCase();
const curTrialSequenceId: number = this.generateSequenceId();
const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
//prepare the runscript
await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, curTrialSequenceId, form);
//upload files to sotrage
const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder);
const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail(
trialJobId,
'WAITING',
Date.now(),
trialWorkingFolder,
form,
kubeflowJobName,
curTrialSequenceId,
trialJobOutputUrl
);
// Generate kubeflow job resource config object
const kubeflowJobConfig: any = await this.prepareKubeflowConfig(trialJobId, trialWorkingFolder, kubeflowJobName);
// Create kubeflow job based on generated kubeflow job resource config
await this.kubernetesCRDClient.createKubernetesJob(kubeflowJobConfig);
// Set trial job detail until create Kubeflow job successfully
this.trialJobsMap.set(trialJobId, trialJobDetail);
return Promise.resolve(trialJobDetail);
}
/**
* upload code files to nfs or azureStroage
* @param trialJobId
* @param trialLocalTempFolder
* return: trialJobOutputUrl
*/
private async uploadCodeFiles(trialJobId: string, trialLocalTempFolder: string): Promise<string> {
if(!this.kubeflowClusterConfig) { if(!this.kubeflowClusterConfig) {
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
} }
if(!this.kubeflowTrialConfig) { let trialJobOutputUrl: string = '';
throw new Error('Kubeflow trial config is not initialized');
assert(!this.kubeflowClusterConfig.storage
|| this.kubeflowClusterConfig.storage === 'azureStorage'
|| this.kubeflowClusterConfig.storage === 'nfs');
if(this.kubeflowClusterConfig.storage === 'azureStorage') {
try{
//upload local files to azure storage
await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
`nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`);
trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}`
}catch(error){
this.log.error(error);
return Promise.reject(error);
}
} else if(this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) {
let nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
// Creat work dir for current trial in NFS directory
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`);
// Copy code files from local dir to NFS mounted dir
await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
const nfsConfig: NFSConfig = nfsKubeflowClusterConfig.nfs;
trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`
} }
if(!this.operatorClient) { return Promise.resolve(trialJobOutputUrl);
throw new Error('Kubeflow job operator client is undefined');
} }
if(!this.kubeflowRestServerPort) { private async prepareRunScript(trialLocalTempFolder: string, trialJobId: string, trialWorkingFolder: string, curTrialSequenceId: number, form: JobApplicationForm): Promise<void> {
const restServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer); if(!this.kubeflowClusterConfig) {
this.kubeflowRestServerPort = restServer.clusterRestServerPort; throw new Error('Kubeflow Cluster config is not initialized');
} }
// initialize kubeflow trial config to specific type // initialize kubeflow trial config to specific type
let kubeflowTrialConfig; let kubeflowTrialConfig;
if(this.kubeflowClusterConfig.operator === 'tf-operator') { if(this.kubeflowClusterConfig.operator === 'tf-operator') {
...@@ -127,11 +174,6 @@ class KubeflowTrainingService implements TrainingService { ...@@ -127,11 +174,6 @@ class KubeflowTrainingService implements TrainingService {
throw Error(`operator ${this.kubeflowClusterConfig.operator} is invalid`) throw Error(`operator ${this.kubeflowClusterConfig.operator} is invalid`)
} }
const trialJobId: string = uniqueString(5);
const curTrialSequenceId: number = this.generateSequenceId();
// Set trial's NFS working folder
const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', getExperimentId(), trialJobId);
const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
//create tmp trial working folder locally. //create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`); await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`);
await cpp.exec(`cp -r ${kubeflowTrialConfig.codeDir} ${trialLocalTempFolder}`); await cpp.exec(`cp -r ${kubeflowTrialConfig.codeDir} ${trialLocalTempFolder}`);
...@@ -143,7 +185,7 @@ class KubeflowTrainingService implements TrainingService { ...@@ -143,7 +185,7 @@ class KubeflowTrainingService implements TrainingService {
// Write worker file content run_worker.sh to local tmp folders // Write worker file content run_worker.sh to local tmp folders
if(kubeflowTrialConfig.worker) { if(kubeflowTrialConfig.worker) {
const workerRunScriptContent: string = this.generateRunScript(trialJobId, trialWorkingFolder, const workerRunScriptContent: string = this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder,
kubeflowTrialConfig.worker.command, curTrialSequenceId.toString(), 'worker', kubeflowTrialConfig.worker.gpuNum); kubeflowTrialConfig.worker.command, curTrialSequenceId.toString(), 'worker', kubeflowTrialConfig.worker.gpuNum);
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_worker.sh'), workerRunScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_worker.sh'), workerRunScriptContent, { encoding: 'utf8' });
...@@ -152,7 +194,7 @@ class KubeflowTrainingService implements TrainingService { ...@@ -152,7 +194,7 @@ class KubeflowTrainingService implements TrainingService {
if(this.kubeflowClusterConfig.operator === 'tf-operator') { if(this.kubeflowClusterConfig.operator === 'tf-operator') {
let tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig; let tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
if(tensorflowTrialConfig.ps){ if(tensorflowTrialConfig.ps){
const psRunScriptContent: string = this.generateRunScript(trialJobId, trialWorkingFolder, const psRunScriptContent: string = this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder,
tensorflowTrialConfig.ps.command, curTrialSequenceId.toString(), 'ps', tensorflowTrialConfig.ps.gpuNum); tensorflowTrialConfig.ps.command, curTrialSequenceId.toString(), 'ps', tensorflowTrialConfig.ps.gpuNum);
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_ps.sh'), psRunScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_ps.sh'), psRunScriptContent, { encoding: 'utf8' });
} }
...@@ -160,7 +202,7 @@ class KubeflowTrainingService implements TrainingService { ...@@ -160,7 +202,7 @@ class KubeflowTrainingService implements TrainingService {
else if(this.kubeflowClusterConfig.operator === 'pytorch-operator') { else if(this.kubeflowClusterConfig.operator === 'pytorch-operator') {
let pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig; let pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;
if(pytorchTrialConfig.master){ if(pytorchTrialConfig.master){
const masterRunScriptContent: string = this.generateRunScript(trialJobId, trialWorkingFolder, const masterRunScriptContent: string = this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder,
pytorchTrialConfig.master.command, curTrialSequenceId.toString(), 'master', pytorchTrialConfig.master.gpuNum); pytorchTrialConfig.master.command, curTrialSequenceId.toString(), 'master', pytorchTrialConfig.master.gpuNum);
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_master.sh'), masterRunScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_master.sh'), masterRunScriptContent, { encoding: 'utf8' });
} }
...@@ -171,7 +213,26 @@ class KubeflowTrainingService implements TrainingService { ...@@ -171,7 +213,26 @@ class KubeflowTrainingService implements TrainingService {
await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)), await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(trialForm.hyperParameters)),
trialForm.hyperParameters.value, { encoding: 'utf8' }); trialForm.hyperParameters.value, { encoding: 'utf8' });
} }
const kubeflowJobName = `nni-exp-${this.experimentId}-trial-${trialJobId}`.toLowerCase(); }
private async prepareKubeflowConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName: string): Promise<any> {
if(!this.kubeflowClusterConfig) {
throw new Error('Kubeflow Cluster config is not initialized');
}
if(!this.kubeflowTrialConfig) {
throw new Error('Kubeflow trial config is not initialized');
}
// initialize kubeflow trial config to specific type
let kubeflowTrialConfig;
if(this.kubeflowClusterConfig.operator === 'tf-operator') {
kubeflowTrialConfig = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
}else if(this.kubeflowClusterConfig.operator === 'pytorch-operator'){
kubeflowTrialConfig = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;
}else {
throw Error(`operator ${this.kubeflowClusterConfig.operator} is invalid`)
}
const workerPodResources : any = {}; const workerPodResources : any = {};
if(kubeflowTrialConfig.worker) { if(kubeflowTrialConfig.worker) {
...@@ -196,139 +257,10 @@ class KubeflowTrainingService implements TrainingService { ...@@ -196,139 +257,10 @@ class KubeflowTrainingService implements TrainingService {
} }
//The output url used in trialJobDetail
let trialJobOutputUrl: string = '';
assert(!this.kubeflowClusterConfig.storage
|| this.kubeflowClusterConfig.storage === 'azureStorage'
|| this.kubeflowClusterConfig.storage === 'nfs');
if(this.kubeflowClusterConfig.storage === 'azureStorage') {
try{
//upload local files to azure storage
await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
`nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`);
trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}`
}catch(error){
this.log.error(error);
return Promise.reject(error);
}
} else if(this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) {
let nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
// Create work dir for current trial in NFS directory
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`);
// Copy code files from local dir to NFS mounted dir
await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
const nfsConfig: NFSConfig = nfsKubeflowClusterConfig.nfs;
trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`
}
const trialJobDetail: KubeflowTrialJobDetail = new KubeflowTrialJobDetail(
trialJobId,
'WAITING',
Date.now(),
trialWorkingFolder,
form,
kubeflowJobName,
curTrialSequenceId,
trialJobOutputUrl
);
// Generate kubeflow job resource config object // Generate kubeflow job resource config object
const kubeflowJobConfig: any = this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources, nonWorkerResources); const kubeflowJobConfig: any = this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources, nonWorkerResources);
// Create kubeflow job based on generated kubeflow job resource config return Promise.resolve(kubeflowJobConfig);
await this.operatorClient.createKubeflowJob(kubeflowJobConfig);
// Set trial job detail until create Kubeflow job successfully
this.trialJobsMap.set(trialJobId, trialJobDetail);
return Promise.resolve(trialJobDetail);
}
/**
 * Build the resource quantities for one pod.
 * @param memory memory amount; rendered with the `Mi` suffix
 * @param cpuNum CPU count; rendered as a plain string
 * @param gpuNum GPU count; rendered as a plain string under the `nvidia.com/gpu` key
 * @returns an object with string values for `memory`, `cpu` and `nvidia.com/gpu`
 */
public generatePodResource(memory: number, cpuNum: number, gpuNum: number) {
    const memoryQuantity = `${memory}Mi`;
    const cpuQuantity = `${cpuNum}`;
    const gpuQuantity = `${gpuNum}`;

    return {
        'memory': memoryQuantity,
        'cpu': cpuQuantity,
        'nvidia.com/gpu': gpuQuantity
    };
}
/**
 * Updating a trial job is not implemented by this training service.
 * @param trialJobId id of the trial job (unused)
 * @param form job application form (unused)
 * @throws MethodNotImplementedError on every call
 */
public updateTrialJob(trialJobId: string, form: JobApplicationForm): Promise<TrialJobDetail> {
    const notImplemented = new MethodNotImplementedError();
    throw notImplemented;
}
/**
 * List the details of every tracked job whose form has jobType 'TRIAL'.
 *
 * Each detail is fetched through getTrialJob, and all lookups are awaited
 * before the list is returned. The previous implementation pushed results
 * from an un-awaited `forEach(async ...)` callback, so the returned array
 * could be observed before it was filled and a failing lookup surfaced as an
 * unhandled rejection instead of reaching the caller.
 *
 * @returns the trial job details, in the iteration order of trialJobsMap
 */
public async listTrialJobs(): Promise<TrialJobDetail[]> {
    const trialJobIds: string[] = [];
    this.trialJobsMap.forEach((value: KubeflowTrialJobDetail, key: string) => {
        if (value.form.jobType === 'TRIAL') {
            trialJobIds.push(key);
        }
    });

    // A synchronous throw from getTrialJob also becomes a rejection of the returned promise.
    return Promise.all(trialJobIds.map((trialJobId: string) => this.getTrialJob(trialJobId)));
}
/**
 * Look up a tracked trial job by id.
 * @param trialJobId id of the trial job
 * @returns a promise resolved with the tracked job detail, or rejected with a
 *          message string when the id is not in trialJobsMap
 * @throws Error synchronously when the kubeflow cluster config has not been set
 */
public getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
    if (!this.kubeflowClusterConfig) {
        throw new Error('Kubeflow Cluster config is not initialized');
    }

    const trackedJob: TrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);

    return trackedJob
        ? Promise.resolve(trackedJob)
        : Promise.reject(`trial job ${trialJobId} not found`);
}
/**
 * Subscribe a listener to the 'metric' event of the metrics emitter.
 * @param listener callback invoked with each emitted trial job metric
 */
public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void) {
    const emitter = this.metricsEmitter;
    emitter.on('metric', listener);
}
/**
 * Unsubscribe a listener from the 'metric' event of the metrics emitter.
 * @param listener callback previously passed to addTrialJobMetricListener
 */
public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void) {
    const emitter = this.metricsEmitter;
    emitter.off('metric', listener);
}
/**
 * Whether multi-phase jobs are supported; this service always reports false.
 */
public get isMultiPhaseJobSupported(): boolean {
    const supported: boolean = false;
    return supported;
}
/**
 * Cancel a trial job by deleting its kubeflow job and marking the tracked detail as cancelled.
 * @param trialJobId id of the trial job to cancel
 * @param isEarlyStopped forwarded to getJobCancelStatus to pick the final status; defaults to false
 * @returns a promise resolved once the job is deleted and the detail updated; rejected with a
 *          message string when the job is unknown, the operator client is undefined, or deletion fails
 */
public async cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise<void> {
const trialJobDetail : KubeflowTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
// Unknown job id: log and reject
if(!trialJobDetail) {
const errorMessage: string = `CancelTrialJob: trial job id ${trialJobId} not found`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}
// The operator client is required to issue the delete request
if(!this.operatorClient) {
const errorMessage: string = `CancelTrialJob: trial job id ${trialJobId} failed because operatorClient is undefined`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}
try {
// Select the job to delete by its app / expId / trialId labels
await this.operatorClient.deleteKubeflowJob(new Map(
[
['app', this.NNI_KUBEFLOW_TRIAL_LABEL],
['expId', getExperimentId()],
['trialId', trialJobId]
]
));
} catch(err) {
const errorMessage: string = `Delete trial ${trialJobId} failed: ${err}`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}
// Only reached after a successful delete: record the end time and the cancel status
trialJobDetail.endTime = Date.now();
trialJobDetail.status = getJobCancelStatus(isEarlyStopped);
return Promise.resolve();
} }
public async setClusterMetadata(key: string, value: string): Promise<void> { public async setClusterMetadata(key: string, value: string): Promise<void> {
...@@ -339,86 +271,25 @@ class KubeflowTrainingService implements TrainingService { ...@@ -339,86 +271,25 @@ class KubeflowTrainingService implements TrainingService {
case TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG: case TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG:
let kubeflowClusterJsonObject = JSON.parse(value); let kubeflowClusterJsonObject = JSON.parse(value);
let kubeflowClusterConfigBase: KubeflowClusterConfigBase this.kubeflowClusterConfig = KubeflowClusterConfigFactory.generateKubeflowClusterConfig(kubeflowClusterJsonObject);
= new KubeflowClusterConfigBase(kubeflowClusterJsonObject.operator, kubeflowClusterJsonObject.apiVersion, kubeflowClusterJsonObject.storage); if(this.kubeflowClusterConfig.storageType === 'azureStorage') {
let azureKubeflowClusterConfig = <KubeflowClusterConfigAzure>this.kubeflowClusterConfig;
if(kubeflowClusterConfigBase && kubeflowClusterConfigBase.storage === 'azureStorage') {
const azureKubeflowClusterConfig: KubeflowClusterConfigAzure =
new KubeflowClusterConfigAzure(kubeflowClusterJsonObject.operator,
kubeflowClusterJsonObject.apiVersion,
kubeflowClusterJsonObject.keyVault,
kubeflowClusterJsonObject.azureStorage, kubeflowClusterJsonObject.storage);
const vaultName = azureKubeflowClusterConfig.keyVault.vaultName;
const valutKeyName = azureKubeflowClusterConfig.keyVault.name;
this.azureStorageAccountName = azureKubeflowClusterConfig.azureStorage.accountName; this.azureStorageAccountName = azureKubeflowClusterConfig.azureStorage.accountName;
this.azureStorageShare = azureKubeflowClusterConfig.azureStorage.azureShare; this.azureStorageShare = azureKubeflowClusterConfig.azureStorage.azureShare;
try { await this.createAzureStorage(
const result = await cpp.exec(`az keyvault secret show --name ${valutKeyName} --vault-name ${vaultName}`); azureKubeflowClusterConfig.keyVault.vaultName,
if(result.stderr) { azureKubeflowClusterConfig.keyVault.name,
const errorMessage: string = result.stderr; azureKubeflowClusterConfig.azureStorage.accountName,
this.log.error(errorMessage); azureKubeflowClusterConfig.azureStorage.azureShare
return Promise.reject(errorMessage); );
} } else if(this.kubeflowClusterConfig.storageType === 'nfs') {
const storageAccountKey =JSON.parse(result.stdout).value; let nfsKubeflowClusterConfig = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
//create storage client await this.createNFSStorage(
this.azureStorageClient = azure.createFileService(this.azureStorageAccountName, storageAccountKey); nfsKubeflowClusterConfig.nfs.server,
await AzureStorageClientUtility.createShare(this.azureStorageClient, this.azureStorageShare); nfsKubeflowClusterConfig.nfs.path
//create storage secret
this.azureStorageSecretName = 'nni-secret-' + uniqueString(8).toLowerCase();
await this.genericK8sClient.createSecret(
{
apiVersion: 'v1',
kind: 'Secret',
metadata: {
name: this.azureStorageSecretName,
namespace: 'default',
labels: {
app: this.NNI_KUBEFLOW_TRIAL_LABEL,
expId: getExperimentId()
}
},
type: 'Opaque',
data: {
azurestorageaccountname: base64.encode(this.azureStorageAccountName),
azurestorageaccountkey: base64.encode(storageAccountKey)
}
}
); );
} catch(error) {
this.log.error(error);
throw new Error(error);
}
this.kubeflowClusterConfig = azureKubeflowClusterConfig;
} else if(kubeflowClusterConfigBase && (kubeflowClusterConfigBase.storage === 'nfs' || kubeflowClusterConfigBase.storage === undefined)) {
//Check and mount NFS mount point here
//If storage is undefined, the default value is nfs
const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS =
new KubeflowClusterConfigNFS(kubeflowClusterJsonObject.operator,
kubeflowClusterJsonObject.apiVersion,
kubeflowClusterJsonObject.nfs,
kubeflowClusterJsonObject.storage);
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}`);
const nfsServer: string = nfsKubeflowClusterConfig.nfs.server;
const nfsPath: string = nfsKubeflowClusterConfig.nfs.path;
try {
await cpp.exec(`sudo mount ${nfsServer}:${nfsPath} ${this.trialLocalNFSTempFolder}`);
} catch(error) {
const mountError: string = `Mount NFS ${nfsServer}:${nfsPath} to ${this.trialLocalNFSTempFolder} failed, error is ${error}`;
this.log.error(mountError);
throw new Error(mountError);
}
this.kubeflowClusterConfig = nfsKubeflowClusterConfig;
} else {
const error: string = `kubeflowClusterConfig format error!`;
this.log.error(error);
throw new Error(error);
} }
this.kubernetesCRDClient = KubeflowOperatorClient.generateOperatorClient(this.kubeflowClusterConfig.operator,
this.operatorClient = KubeflowOperatorClient.generateOperatorClient(this.kubeflowClusterConfig.operator,
this.kubeflowClusterConfig.apiVersion); this.kubeflowClusterConfig.apiVersion);
break; break;
...@@ -430,18 +301,10 @@ class KubeflowTrainingService implements TrainingService { ...@@ -430,18 +301,10 @@ class KubeflowTrainingService implements TrainingService {
assert(this.kubeflowClusterConfig !== undefined) assert(this.kubeflowClusterConfig !== undefined)
let kubeflowTrialJsonObjsect = JSON.parse(value); let kubeflowTrialJsonObjsect = JSON.parse(value);
if(this.kubeflowClusterConfig.operator === 'tf-operator'){ this.kubeflowTrialConfig = KubeflowTrialConfigFactory.generateKubeflowTrialConfig(
this.kubeflowTrialConfig = new KubeflowTrialConfigTensorflow(kubeflowTrialJsonObjsect.codeDir, kubeflowTrialJsonObjsect,
kubeflowTrialJsonObjsect.worker, kubeflowTrialJsonObjsect.ps); this.kubeflowClusterConfig.operator
}else if(this.kubeflowClusterConfig.operator === 'pytorch-operator'){ );
this.kubeflowTrialConfig = new KubeflowTrialConfigPytorch(kubeflowTrialJsonObjsect.codeDir,
kubeflowTrialJsonObjsect.master, kubeflowTrialJsonObjsect.worker);
}
if (!this.kubeflowTrialConfig){
this.log.error('kubeflow kubeflow TrialConfig is not initialized');
return Promise.reject(new Error('kubeflow kubeflow TrialConfig is not initialized'));
}
// Validate to make sure codeDir doesn't have too many files // Validate to make sure codeDir doesn't have too many files
try { try {
...@@ -458,61 +321,6 @@ class KubeflowTrainingService implements TrainingService { ...@@ -458,61 +321,6 @@ class KubeflowTrainingService implements TrainingService {
return Promise.resolve(); return Promise.resolve();
} }
/**
 * Cluster metadata lookup; this service always answers with an empty string.
 * @param key metadata key (unused)
 * @returns a promise resolved with ''
 */
public getClusterMetadata(key: string): Promise<string> {
    const noMetadata: string = '';
    return Promise.resolve(noMetadata);
}
/**
 * Best-effort shutdown of the training service.
 *
 * Steps, in order: set the stopping flag, cancel every tracked job whose status
 * is RUNNING / WAITING / UNKNOWN, delete all kubeflow jobs labelled with the
 * current experiment id, unmount the local NFS temp folder, and stop the
 * kubeflow rest server. A failure in any step is logged and does not abort the
 * remaining steps; the returned promise always resolves.
 *
 * The previous version created a floating `Promise.reject(error)` when stopping
 * the rest server failed. It was neither returned nor awaited, so it could not
 * reach the caller and only produced an unhandled rejection. It has been removed.
 */
public async cleanUp(): Promise<void> {
    this.stopping = true;

    // First, cancel all running kubeflow jobs
    for (let [trialJobId, kubeflowTrialJob] of this.trialJobsMap) {
        if (['RUNNING', 'WAITING', 'UNKNOWN'].includes(kubeflowTrialJob.status)) {
            try {
                await this.cancelTrialJob(trialJobId);
            } catch (error) {
                // DONT throw error during cleanup; cancelTrialJob logs its own failures
            }
            kubeflowTrialJob.status = 'SYS_CANCELED';
        }
    }

    // Delete all kubeflow jobs whose expId label is current experiment id
    try {
        if (this.operatorClient) {
            await this.operatorClient.deleteKubeflowJob(new Map(
                [
                    ['app', this.NNI_KUBEFLOW_TRIAL_LABEL],
                    ['expId', getExperimentId()]
                ]
            ));
        }
    } catch (error) {
        this.log.error(`Delete kubeflow job with label: app=${this.NNI_KUBEFLOW_TRIAL_LABEL},expId=${getExperimentId()} failed, error is ${error}`);
    }

    // Unmount NFS
    try {
        await cpp.exec(`sudo umount ${this.trialLocalNFSTempFolder}`);
    } catch (error) {
        this.log.error(`Unmount ${this.trialLocalNFSTempFolder} failed, error is ${error}`);
    }

    // Stop Kubeflow rest server
    const restServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer);
    try {
        await restServer.stop();
        this.log.info('Kubeflow Training service rest server stopped successfully.');
    } catch (error) {
        // Log only: cleanup stays best-effort and still resolves
        this.log.error(`Kubeflow Training service rest server stopped failed, error: ${error.message}`);
    }
}
/**
 * Expose the emitter on which 'metric' listeners are registered.
 */
public get MetricsEmitter() : EventEmitter {
    const emitter: EventEmitter = this.metricsEmitter;
    return emitter;
}
/** /**
* Generate kubeflow resource config file * Generate kubeflow resource config file
* @param trialJobId trial job id * @param trialJobId trial job id
...@@ -530,14 +338,14 @@ class KubeflowTrainingService implements TrainingService { ...@@ -530,14 +338,14 @@ class KubeflowTrainingService implements TrainingService {
throw new Error('Kubeflow trial config is not initialized'); throw new Error('Kubeflow trial config is not initialized');
} }
if(!this.operatorClient) { if(!this.kubernetesCRDClient) {
throw new Error('Kubeflow operator client is not initialized'); throw new Error('Kubeflow operator client is not initialized');
} }
const replicaSpecsObj: any = {}; const replicaSpecsObj: any = {};
let replicaSpecsObjMap = new Map<string, object>(); let replicaSpecsObjMap = new Map<string, object>();
if(this.kubeflowClusterConfig.operator === 'tf-operator') { if(this.kubeflowTrialConfig.operatorType === 'tf-operator') {
let tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig; let tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas, replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas,
tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources); tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources);
...@@ -546,9 +354,9 @@ class KubeflowTrainingService implements TrainingService { ...@@ -546,9 +354,9 @@ class KubeflowTrainingService implements TrainingService {
replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas, replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas,
tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources); tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources);
} }
replicaSpecsObjMap.set(this.operatorClient.jobKind, {'tfReplicaSpecs': replicaSpecsObj}) replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {'tfReplicaSpecs': replicaSpecsObj})
} }
else if(this.kubeflowClusterConfig.operator === 'pytorch-operator') { else if(this.kubeflowTrialConfig.operatorType === 'pytorch-operator') {
let pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig; let pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;
if(pytorchTrialConfig.worker) { if(pytorchTrialConfig.worker) {
replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas, replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas,
...@@ -557,22 +365,22 @@ class KubeflowTrainingService implements TrainingService { ...@@ -557,22 +365,22 @@ class KubeflowTrainingService implements TrainingService {
replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas, replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas,
pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources); pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources);
replicaSpecsObjMap.set(this.operatorClient.jobKind, {'pytorchReplicaSpecs': replicaSpecsObj}) replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {'pytorchReplicaSpecs': replicaSpecsObj})
} }
return { return {
apiVersion: `kubeflow.org/${this.operatorClient.apiVersion}`, apiVersion: `kubeflow.org/${this.kubernetesCRDClient.apiVersion}`,
kind: this.operatorClient.jobKind, kind: this.kubernetesCRDClient.jobKind,
metadata: { metadata: {
name: kubeflowJobName, name: kubeflowJobName,
namespace: 'default', namespace: 'default',
labels: { labels: {
app: this.NNI_KUBEFLOW_TRIAL_LABEL, app: this.NNI_KUBERNETES_TRIAL_LABEL,
expId: getExperimentId(), expId: getExperimentId(),
trialId: trialJobId trialId: trialJobId
} }
}, },
spec: replicaSpecsObjMap.get(this.operatorClient.jobKind) spec: replicaSpecsObjMap.get(this.kubernetesCRDClient.jobKind)
}; };
} }
...@@ -593,12 +401,12 @@ class KubeflowTrainingService implements TrainingService { ...@@ -593,12 +401,12 @@ class KubeflowTrainingService implements TrainingService {
throw new Error('Kubeflow trial config is not initialized'); throw new Error('Kubeflow trial config is not initialized');
} }
if(!this.operatorClient) { if(!this.kubernetesCRDClient) {
throw new Error('Kubeflow operator client is not initialized'); throw new Error('Kubeflow operator client is not initialized');
} }
let volumeSpecMap = new Map<string, object>(); let volumeSpecMap = new Map<string, object>();
if(this.kubeflowClusterConfig.storage && this.kubeflowClusterConfig.storage === 'azureStorage'){ if(this.kubeflowClusterConfig.storageType === 'azureStorage'){
volumeSpecMap.set('nniVolumes', [ volumeSpecMap.set('nniVolumes', [
{ {
name: 'nni-vol', name: 'nni-vol',
...@@ -631,7 +439,7 @@ class KubeflowTrainingService implements TrainingService { ...@@ -631,7 +439,7 @@ class KubeflowTrainingService implements TrainingService {
{ {
// Kubeflow tensorflow operator requires that containers' name must be tensorflow // Kubeflow tensorflow operator requires that containers' name must be tensorflow
// TODO: change the name based on operator's type // TODO: change the name based on operator's type
name: this.operatorClient.containerName, name: this.kubernetesCRDClient.containerName,
image: replicaImage, image: replicaImage,
args: ["sh", `${path.join(trialWorkingFolder, runScriptFile)}`], args: ["sh", `${path.join(trialWorkingFolder, runScriptFile)}`],
volumeMounts: [ volumeMounts: [
...@@ -647,55 +455,6 @@ class KubeflowTrainingService implements TrainingService { ...@@ -647,55 +455,6 @@ class KubeflowTrainingService implements TrainingService {
} }
}; };
} }
/**
 * Generate the bash run script for one role (for example worker or ps).
 *
 * The script exports the NNI_* environment variables, creates the system and
 * output directories, copies the code directory into the system directory,
 * runs install_nni.sh and finally launches the trial keeper with the trial command.
 *
 * @param trialJobId trial job id
 * @param trialWorkingFolder working folder; exported as NNI_CODE_DIR
 * @param command trial command handed to the trial keeper
 * @param trialSequenceId sequence id; exported as NNI_TRIAL_SEQ_ID
 * @param roleType role name, used to name the output sub-folder `<role>_output`
 * @param gpuNum requested GPU number; 0 adds an empty CUDA_VISIBLE_DEVICES export
 * @returns the script lines joined with '\n'
 */
private generateRunScript(trialJobId: string, trialWorkingFolder: string,
    command: string, trialSequenceId: string, roleType: DistTrainRole, gpuNum: number): string {
    const outputDir: string = path.join(trialWorkingFolder, 'output', `${roleType}_output`);

    const environmentLines: string[] = [
        '#!/bin/bash',
        'export NNI_PLATFORM=kubeflow',
        `export NNI_SYS_DIR=$PWD/nni/${trialJobId}`,
        `export NNI_OUTPUT_DIR=${outputDir}`,
        'export MULTI_PHASE=false',
        `export NNI_TRIAL_JOB_ID=${trialJobId}`,
        `export NNI_EXP_ID=${getExperimentId()}`,
        `export NNI_CODE_DIR=${trialWorkingFolder}`,
        `export NNI_TRIAL_SEQ_ID=${trialSequenceId}`
    ];

    // Nvidia device plugin for K8S has a known issue that requesting zero GPUs allocates all GPUs
    // Refer https://github.com/NVIDIA/k8s-device-plugin/issues/61
    // So CUDA_VISIBLE_DEVICES is explicitly emptied when the user sets gpuNum to 0 in the NNI config file
    const gpuLines: string[] = gpuNum === 0 ? [`export CUDA_VISIBLE_DEVICES=''`] : [];

    // Prefer the configured NNI manager IP; otherwise fall back to the detected IPv4 address
    const nniManagerIp = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address();

    const trialKeeperLine: string =
        `python3 -m nni_trial_tool.trial_keeper --trial_command '${command}' `
        + `--nnimanager_ip '${nniManagerIp}' --nnimanager_port '${this.kubeflowRestServerPort}' `
        + `1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`;

    const launchLines: string[] = [
        'mkdir -p $NNI_SYS_DIR',
        'mkdir -p $NNI_OUTPUT_DIR',
        'cp -rT $NNI_CODE_DIR $NNI_SYS_DIR',
        'cd $NNI_SYS_DIR',
        'sh install_nni.sh # Check and install NNI pkg',
        trialKeeperLine
    ];

    return environmentLines.concat(gpuLines, launchLines).join('\n');
}
/**
 * Hand out the next trial sequence id.
 *
 * The counter is initialized from getInitTrialSequenceId() while it still holds
 * the sentinel -1; afterwards every call returns the current value and advances it by one.
 */
private generateSequenceId(): number {
    const notInitialized: boolean = this.nextTrialSequenceId === -1;
    if (notInitialized) {
        this.nextTrialSequenceId = getInitTrialSequenceId();
    }

    const assignedId: number = this.nextTrialSequenceId;
    this.nextTrialSequenceId = assignedId + 1;
    return assignedId;
}
} }
export { KubeflowTrainingService } export { KubeflowTrainingService }
...@@ -19,11 +19,9 @@ ...@@ -19,11 +19,9 @@
'use strict'; 'use strict';
import * as fs from 'fs';
import * as os from 'os' import * as os from 'os'
import * as path from 'path'; import * as path from 'path';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { KubeflowOperator, OperatorApiVersion } from './kubeflowConfig';
var K8SClient = require('kubernetes-client').Client; var K8SClient = require('kubernetes-client').Client;
var K8SConfig = require('kubernetes-client').config; var K8SConfig = require('kubernetes-client').config;
...@@ -52,7 +50,7 @@ class GeneralK8sClient { ...@@ -52,7 +50,7 @@ class GeneralK8sClient {
} }
} }
abstract class KubeflowOperatorClient { abstract class KubernetesCRDClient {
protected readonly client: any; protected readonly client: any;
protected readonly log: Logger = getLogger(); protected readonly log: Logger = getLogger();
protected crdSchema: any; protected crdSchema: any;
...@@ -66,28 +64,6 @@ abstract class KubeflowOperatorClient { ...@@ -66,28 +64,6 @@ abstract class KubeflowOperatorClient {
public abstract get containerName(): string; public abstract get containerName(): string;
/**
 * Factory method to generate the operator client matching an operator and api version.
 *
 * Supported combinations are 'tf-operator' and 'pytorch-operator', each with
 * 'v1alpha2' or 'v1beta1'. Comparisons use strict equality (the previous code
 * compared the api version with loose `==`).
 *
 * @param kubeflowOperator operator name
 * @param operatorApiVersion api version of the operator
 * @returns a new client instance for the requested combination
 * @throws Error when the operator / api version combination is not supported
 */
public static generateOperatorClient(kubeflowOperator: KubeflowOperator,
    operatorApiVersion: OperatorApiVersion): KubeflowOperatorClient {
    if (kubeflowOperator === 'tf-operator') {
        if (operatorApiVersion === 'v1alpha2') {
            return new TFOperatorClientV1Alpha2();
        } else if (operatorApiVersion === 'v1beta1') {
            return new TFOperatorClientV1Beta1();
        }
    } else if (kubeflowOperator === 'pytorch-operator') {
        if (operatorApiVersion === 'v1alpha2') {
            return new PytorchOperatorClientV1Alpha2();
        } else if (operatorApiVersion === 'v1beta1') {
            return new PytorchOperatorClientV1Beta1();
        }
    }
    // Reached for an unknown operator, or a known operator with an unknown api version
    throw new Error(`Invalid operator ${kubeflowOperator} or apiVersion ${operatorApiVersion}`);
}
public get jobKind(): string { public get jobKind(): string {
if(this.crdSchema if(this.crdSchema
&& this.crdSchema.spec && this.crdSchema.spec
...@@ -109,19 +85,19 @@ abstract class KubeflowOperatorClient { ...@@ -109,19 +85,19 @@ abstract class KubeflowOperatorClient {
} }
} }
public async createKubeflowJob(jobManifest: any): Promise<boolean> { public async createKubernetesJob(jobManifest: any): Promise<boolean> {
let result: Promise<boolean>; let result: Promise<boolean>;
const response : any = await this.operator.post({body: jobManifest}); const response : any = await this.operator.post({body: jobManifest});
if(response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) { if(response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) {
result = Promise.resolve(true); result = Promise.resolve(true);
} else { } else {
result = Promise.reject(`KubeflowOperatorClient create tfjobs failed, statusCode is ${response.statusCode}`); result = Promise.reject(`Create kubernetes job failed, statusCode is ${response.statusCode}`);
} }
return result; return result;
} }
//TODO : replace any //TODO : replace any
public async getKubeflowJob(kubeflowJobName: string): Promise<any> { public async getKubernetesJob(kubeflowJobName: string): Promise<any> {
let result: Promise<any>; let result: Promise<any>;
const response : any = await this.operator(kubeflowJobName).get(); const response : any = await this.operator(kubeflowJobName).get();
if(response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) { if(response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) {
...@@ -132,12 +108,17 @@ abstract class KubeflowOperatorClient { ...@@ -132,12 +108,17 @@ abstract class KubeflowOperatorClient {
return result; return result;
} }
public async deleteKubeflowJob(labels: Map<string, string>): Promise<boolean> { public async deleteKubernetesJob(labels: Map<string, string>): Promise<boolean> {
let result: Promise<boolean>; let result: Promise<boolean>;
// construct match query from labels for deleting tfjob // construct match query from labels for deleting tfjob
const matchQuery: string = Array.from(labels.keys()).map(labelKey => `${labelKey}=${labels.get(labelKey)}`).join(','); const matchQuery: string = Array.from(labels.keys()).map(labelKey => `${labelKey}=${labels.get(labelKey)}`).join(',');
try { try {
const deleteResult : any = await this.operator().delete({ qs: { labelSelector: matchQuery } }); const deleteResult : any = await this.operator().delete({
qs: {
labelSelector: matchQuery,
propagationPolicy: "Background"
}
});
if(deleteResult.statusCode && deleteResult.statusCode >= 200 && deleteResult.statusCode <= 299) { if(deleteResult.statusCode && deleteResult.statusCode >= 200 && deleteResult.statusCode <= 299) {
result = Promise.resolve(true); result = Promise.resolve(true);
} else { } else {
...@@ -151,80 +132,4 @@ abstract class KubeflowOperatorClient { ...@@ -151,80 +132,4 @@ abstract class KubeflowOperatorClient {
} }
} }
// Operator client for tfjobs at kubeflow.org v1alpha2 in the 'default' namespace.
class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient { export { KubernetesCRDClient, GeneralK8sClient };
/**
* constructor, to initialize tfjob CRD definition:
* reads the v1alpha2 tfjob CRD schema JSON from disk and registers it with the client
*/
public constructor() {
super();
this.crdSchema = JSON.parse(fs.readFileSync('./config/kubeflow/tfjob-crd-v1alpha2.json', 'utf8'));
this.client.addCustomResourceDefinition(this.crdSchema);
}
// Accessor for the tfjobs resource under kubeflow.org/v1alpha2 in namespace 'default'
protected get operator(): any {
return this.client.apis["kubeflow.org"].v1alpha2.namespaces('default').tfjobs;
}
// Container name reported for this operator
public get containerName(): string {
return 'tensorflow';
}
}
/**
 * Operator client for tfjobs at kubeflow.org v1beta1 in the 'default' namespace.
 */
class TFOperatorClientV1Beta1 extends KubeflowOperatorClient {
    /**
     * Read the v1beta1 tfjob CRD schema from disk and register it with the client.
     */
    public constructor() {
        super();
        const crdJson: string = fs.readFileSync('./config/kubeflow/tfjob-crd-v1beta1.json', 'utf8');
        this.crdSchema = JSON.parse(crdJson);
        this.client.addCustomResourceDefinition(this.crdSchema);
    }

    /** Accessor for the tfjobs resource under kubeflow.org/v1beta1 in namespace 'default'. */
    protected get operator(): any {
        const kubeflowApi: any = this.client.apis["kubeflow.org"];
        return kubeflowApi.v1beta1.namespaces('default').tfjobs;
    }

    /** Container name reported for this operator. */
    public get containerName(): string {
        return 'tensorflow';
    }
}
/**
 * Operator client for pytorchjobs at kubeflow.org v1alpha2 in the 'default' namespace.
 */
class PytorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
    /**
     * Read the v1alpha2 pytorchjob CRD schema from disk and register it with the client.
     */
    public constructor() {
        super();
        const crdJson: string = fs.readFileSync('./config/kubeflow/pytorchjob-crd-v1alpha2.json', 'utf8');
        this.crdSchema = JSON.parse(crdJson);
        this.client.addCustomResourceDefinition(this.crdSchema);
    }

    /** Accessor for the pytorchjobs resource under kubeflow.org/v1alpha2 in namespace 'default'. */
    protected get operator(): any {
        const kubeflowApi: any = this.client.apis["kubeflow.org"];
        return kubeflowApi.v1alpha2.namespaces('default').pytorchjobs;
    }

    /** Container name reported for this operator. */
    public get containerName(): string {
        return 'pytorch';
    }
}
/**
 * Operator client for pytorchjobs at kubeflow.org v1beta1 in the 'default' namespace.
 */
class PytorchOperatorClientV1Beta1 extends KubeflowOperatorClient {
    /**
     * Read the v1beta1 pytorchjob CRD schema from disk and register it with the client.
     */
    public constructor() {
        super();
        const crdJson: string = fs.readFileSync('./config/kubeflow/pytorchjob-crd-v1beta1.json', 'utf8');
        this.crdSchema = JSON.parse(crdJson);
        this.client.addCustomResourceDefinition(this.crdSchema);
    }

    /** Accessor for the pytorchjobs resource under kubeflow.org/v1beta1 in namespace 'default'. */
    protected get operator(): any {
        const kubeflowApi: any = this.client.apis["kubeflow.org"];
        return kubeflowApi.v1beta1.namespaces('default').pytorchjobs;
    }

    /** Container name reported for this operator. */
    public get containerName(): string {
        return 'pytorch';
    }
}
export { KubeflowOperatorClient, GeneralK8sClient };
...@@ -19,59 +19,99 @@ ...@@ -19,59 +19,99 @@
'use strict'; 'use strict';
/** operator types that kubeflow supported */ export type KubernetesStorageKind = 'nfs' | 'azureStorage';
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' ; import { MethodNotImplementedError } from '../../common/errors';
export type KubeflowStorageKind = 'nfs' | 'azureStorage';
export type DistTrainRole = 'worker' | 'ps' | 'master';
export type OperatorApiVersion = 'v1alpha2' | 'v1beta1';
/** export abstract class KubernetesClusterConfig {
* Kuberflow cluster configuration public readonly storage?: KubernetesStorageKind;
* public readonly apiVersion: string;
*/
export class KubeflowClusterConfigBase { constructor(apiVersion: string, storage?: KubernetesStorageKind) {
/** Name of Kubeflow operator, like tf-operator */ this.storage = storage;
public readonly operator: KubeflowOperator;
public readonly apiVersion: OperatorApiVersion;
public readonly storage?: KubeflowStorageKind;
/**
* Constructor
* @param userName User name of Kubeflow Cluster
* @param passWord password of Kubeflow Cluster
* @param host Host IP of Kubeflow Cluster
*/
constructor(operator: KubeflowOperator, apiVersion: OperatorApiVersion, storage?: KubeflowStorageKind) {
this.operator = operator;
this.apiVersion = apiVersion; this.apiVersion = apiVersion;
}
public get storageType(): KubernetesStorageKind{
throw new MethodNotImplementedError();
}
}
export class StorageConfig {
public readonly storage?: KubernetesStorageKind;
constructor(storage?: KubernetesStorageKind) {
this.storage = storage; this.storage = storage;
} }
} }
export class KubeflowClusterConfigNFS extends KubeflowClusterConfigBase{ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
public readonly nfs: NFSConfig; public readonly nfs: NFSConfig;
constructor(operator: KubeflowOperator, constructor(
apiVersion: OperatorApiVersion, apiVersion: string,
nfs: NFSConfig, storage?: KubeflowStorageKind) { nfs: NFSConfig,
super(operator, apiVersion, storage); storage?: KubernetesStorageKind
) {
super(apiVersion, storage);
this.nfs = nfs; this.nfs = nfs;
} }
public get storageType(): KubernetesStorageKind{
return 'nfs';
}
public static getInstance(jsonObject: object): KubernetesClusterConfigNFS {
let kubernetesClusterConfigObjectNFS = <KubernetesClusterConfigNFS>jsonObject;
return new KubernetesClusterConfigNFS(
kubernetesClusterConfigObjectNFS.apiVersion,
kubernetesClusterConfigObjectNFS.nfs,
kubernetesClusterConfigObjectNFS.storage
);
}
} }
export class KubeflowClusterConfigAzure extends KubeflowClusterConfigBase{ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
public readonly keyVault: keyVaultConfig; public readonly keyVault: keyVaultConfig;
public readonly azureStorage: AzureStorage; public readonly azureStorage: AzureStorage;
constructor(operator: KubeflowOperator, constructor(
apiVersion: OperatorApiVersion, apiVersion: string,
keyVault: keyVaultConfig, keyVault: keyVaultConfig,
azureStorage: AzureStorage, azureStorage: AzureStorage,
storage?: KubeflowStorageKind) { storage?: KubernetesStorageKind
super(operator, apiVersion, storage); ) {
super(apiVersion, storage);
this.keyVault = keyVault; this.keyVault = keyVault;
this.azureStorage = azureStorage; this.azureStorage = azureStorage;
} }
public get storageType(): KubernetesStorageKind{
return 'azureStorage';
}
public static getInstance(jsonObject: object): KubernetesClusterConfigAzure {
let kubernetesClusterConfigObjectAzure = <KubernetesClusterConfigAzure>jsonObject;
return new KubernetesClusterConfigAzure(
kubernetesClusterConfigObjectAzure.apiVersion,
kubernetesClusterConfigObjectAzure.keyVault,
kubernetesClusterConfigObjectAzure.azureStorage,
kubernetesClusterConfigObjectAzure.storage
);
}
}
export class KubernetesClusterConfigFactory {
/**
 * Build the cluster config object that matches the `storage` field of a parsed JSON object.
 *
 * 'azureStorage' yields an Azure config. 'nfs' and an omitted `storage` both
 * yield an NFS config. The previous `case 'nfs' || undefined` evaluated to
 * `case 'nfs'` only, so an omitted `storage` fell through to the error.
 *
 * @param jsonObject parsed cluster configuration
 * @returns the storage-specific cluster config instance
 * @throws Error when `storage` holds any other value; the message includes the
 *         JSON text of the object rather than "[object Object]"
 */
public static generateKubernetesClusterConfig(jsonObject: object): KubernetesClusterConfig {
    let storageConfig = <StorageConfig>jsonObject;
    switch (storageConfig.storage) {
        case 'azureStorage':
            return KubernetesClusterConfigAzure.getInstance(jsonObject);
        case 'nfs':
        case undefined:
            // NFS is the default when no storage kind is given
            return KubernetesClusterConfigNFS.getInstance(jsonObject);
        default:
            throw new Error(`Invalid json object ${JSON.stringify(jsonObject)}`);
    }
}
} }
/** /**
...@@ -121,12 +161,9 @@ export class AzureStorage { ...@@ -121,12 +161,9 @@ export class AzureStorage {
} }
/** /**
* Trial job configuration for Kubeflow * Trial job configuration for Kubernetes
*/ */
export class KubeflowTrialConfigTemplate { export class KubernetesTrialConfigTemplate {
/** replication number of current role */
public readonly replicas: number;
/** CPU number */ /** CPU number */
public readonly cpuNum: number; public readonly cpuNum: number;
...@@ -142,9 +179,8 @@ export class KubeflowTrialConfigTemplate { ...@@ -142,9 +179,8 @@ export class KubeflowTrialConfigTemplate {
/** Required GPU number for trial job. The number should be in [0,100] */ /** Required GPU number for trial job. The number should be in [0,100] */
public readonly gpuNum : number; public readonly gpuNum : number;
constructor(replicas: number, command : string, gpuNum : number, constructor(command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string) { cpuNum: number, memoryMB: number, image: string) {
this.replicas = replicas;
this.command = command; this.command = command;
this.gpuNum = gpuNum; this.gpuNum = gpuNum;
this.cpuNum = cpuNum; this.cpuNum = cpuNum;
...@@ -153,33 +189,10 @@ export class KubeflowTrialConfigTemplate { ...@@ -153,33 +189,10 @@ export class KubeflowTrialConfigTemplate {
} }
} }
export class KubeflowTrialConfigBase { export class KubernetesTrialConfig {
public readonly codeDir: string; public readonly codeDir: string;
constructor(codeDir: string) { constructor(codeDir: string) {
this.codeDir = codeDir; this.codeDir = codeDir;
} }
} }
\ No newline at end of file
/**
 * Kubeflow trial configuration with a required `worker` template and an
 * optional `ps` template, on top of the code directory kept by the base class.
 */
export class KubeflowTrialConfigTensorflow extends KubeflowTrialConfigBase {
    /** Template for the `ps` entry; may be omitted. */
    public readonly ps?: KubeflowTrialConfigTemplate;
    /** Template for the `worker` entry. */
    public readonly worker: KubeflowTrialConfigTemplate;

    constructor(
        codeDir: string,
        worker: KubeflowTrialConfigTemplate,
        ps?: KubeflowTrialConfigTemplate
    ) {
        super(codeDir);
        this.ps = ps;
        this.worker = worker;
    }
}
/**
 * Kubeflow trial configuration with a required `master` template and an
 * optional `worker` template, on top of the code directory kept by the base class.
 */
export class KubeflowTrialConfigPytorch extends KubeflowTrialConfigBase {
    /** Template for the `master` entry. */
    public readonly master: KubeflowTrialConfigTemplate;
    /** Template for the `worker` entry; may be omitted. */
    public readonly worker?: KubeflowTrialConfigTemplate;

    constructor(
        codeDir: string,
        master: KubeflowTrialConfigTemplate,
        worker?: KubeflowTrialConfigTemplate
    ) {
        super(codeDir);
        this.master = master;
        this.worker = worker;
    }
}
...@@ -25,7 +25,7 @@ import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../commo ...@@ -25,7 +25,7 @@ import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../commo
* KubeflowTrialJobDetail * KubeflowTrialJobDetail
*/ */
// tslint:disable-next-line:max-classes-per-file // tslint:disable-next-line:max-classes-per-file
export class KubeflowTrialJobDetail implements TrialJobDetail { export class KubernetesTrialJobDetail implements TrialJobDetail {
public id: string; public id: string;
public status: TrialJobStatus; public status: TrialJobStatus;
public submitTime: number; public submitTime: number;
...@@ -35,19 +35,19 @@ export class KubeflowTrialJobDetail implements TrialJobDetail { ...@@ -35,19 +35,19 @@ export class KubeflowTrialJobDetail implements TrialJobDetail {
public url?: string; public url?: string;
public workingDirectory: string; public workingDirectory: string;
public form: JobApplicationForm; public form: JobApplicationForm;
public kubeflowJobName: string; public kubernetesJobName: string;
public sequenceId: number; public sequenceId: number;
public queryJobFailedCount: number; public queryJobFailedCount: number;
constructor(id: string, status: TrialJobStatus, submitTime: number, constructor(id: string, status: TrialJobStatus, submitTime: number,
workingDirectory: string, form: JobApplicationForm, workingDirectory: string, form: JobApplicationForm,
kubeflowJobName: string, sequenceId: number, url: string) { kubernetesJobName: string, sequenceId: number, url: string) {
this.id = id; this.id = id;
this.status = status; this.status = status;
this.submitTime = submitTime; this.submitTime = submitTime;
this.workingDirectory = workingDirectory; this.workingDirectory = workingDirectory;
this.form = form; this.form = form;
this.kubeflowJobName = kubeflowJobName; this.kubernetesJobName = kubernetesJobName;
this.sequenceId = sequenceId; this.sequenceId = sequenceId;
this.tags = []; this.tags = [];
this.queryJobFailedCount = 0; this.queryJobFailedCount = 0;
...@@ -55,4 +55,21 @@ export class KubeflowTrialJobDetail implements TrialJobDetail { ...@@ -55,4 +55,21 @@ export class KubeflowTrialJobDetail implements TrialJobDetail {
} }
} }
export type KubeflowTFJobType = 'Created' | 'Running' | 'Failed' | 'Succeeded'; export const KubernetesScriptFormat =
\ No newline at end of file `#!/bin/bash
export NNI_PLATFORM={0}
export NNI_SYS_DIR=$PWD/nni/{1}
export NNI_OUTPUT_DIR={2}
export MULTI_PHASE=false
export NNI_TRIAL_JOB_ID={3}
export NNI_EXP_ID={4}
export NNI_CODE_DIR={5}
export NNI_TRIAL_SEQ_ID={6}
{7}
mkdir -p $NNI_SYS_DIR
mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} `
+ `1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
...@@ -20,88 +20,45 @@ ...@@ -20,88 +20,45 @@
'use strict'; 'use strict';
import * as assert from 'assert'; import * as assert from 'assert';
import * as cpp from 'child-process-promise';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { KubeflowTrialJobDetail, KubeflowTFJobType} from './kubeflowData';
import { NNIError, NNIErrorNames } from '../../common/errors'; import { NNIError, NNIErrorNames } from '../../common/errors';
import { TrialJobStatus } from '../../common/trainingService'; import { TrialJobStatus } from '../../common/trainingService';
import { KubeflowOperatorClient } from './kubernetesApiClient'; import { KubernetesCRDClient } from './kubernetesApiClient';
import { MethodNotImplementedError } from '../../common/errors';
import { KubernetesTrialJobDetail } from './kubernetesData';
/** /**
* Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally * Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally
*/ */
export class KubeflowJobInfoCollector { export class KubernetesJobInfoCollector {
private readonly trialJobsMap : Map<string, KubeflowTrialJobDetail>; protected readonly trialJobsMap : Map<string, KubernetesTrialJobDetail>;
private readonly log: Logger = getLogger(); protected readonly log: Logger = getLogger();
private readonly statusesNeedToCheck: TrialJobStatus[]; protected readonly statusesNeedToCheck: TrialJobStatus[];
constructor(jobMap: Map<string, KubeflowTrialJobDetail>) { constructor(jobMap: Map<string, KubernetesTrialJobDetail>) {
this.trialJobsMap = jobMap; this.trialJobsMap = jobMap;
this.statusesNeedToCheck = ['RUNNING', 'WAITING']; this.statusesNeedToCheck = ['RUNNING', 'WAITING'];
} }
public async retrieveTrialStatus(operatorClient: KubeflowOperatorClient | undefined) : Promise<void> { public async retrieveTrialStatus(kubernetesCRDClient: KubernetesCRDClient | undefined) : Promise<void> {
assert(operatorClient !== undefined); assert(kubernetesCRDClient !== undefined);
const updateKubeflowTrialJobs : Promise<void>[] = []; const updateKubernetesTrialJobs : Promise<void>[] = [];
for(let [trialJobId, kubeflowTrialJob] of this.trialJobsMap) { for(let [trialJobId, kubernetesTrialJob] of this.trialJobsMap) {
if (!kubeflowTrialJob) { if (!kubernetesTrialJob) {
throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`); throw new NNIError(NNIErrorNames.NOT_FOUND, `trial job id ${trialJobId} not found`);
} }
// Since Kubeflow needs some delay to schedule jobs, we provide 20 seconds buffer time to check kubeflow job's status // Since Kubeflow needs some delay to schedule jobs, we provide 20 seconds buffer time to check kubeflow job's status
if( Date.now() - kubeflowTrialJob.submitTime < 20 * 1000) { if( Date.now() - kubernetesTrialJob.submitTime < 20 * 1000) {
return Promise.resolve(); return Promise.resolve();
} }
updateKubeflowTrialJobs.push(this.retrieveSingleTrialJobInfo(operatorClient, kubeflowTrialJob)) updateKubernetesTrialJobs.push(this.retrieveSingleTrialJobInfo(kubernetesCRDClient, kubernetesTrialJob))
} }
await Promise.all(updateKubeflowTrialJobs); await Promise.all(updateKubernetesTrialJobs);
} }
private async retrieveSingleTrialJobInfo(operatorClient: KubeflowOperatorClient | undefined, protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
kubeflowTrialJob : KubeflowTrialJobDetail) : Promise<void> { kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> {
if (!this.statusesNeedToCheck.includes(kubeflowTrialJob.status)) { throw new MethodNotImplementedError();
return Promise.resolve();
}
if(operatorClient === undefined) {
return Promise.reject('operatorClient is undefined');
}
let kubeflowJobInfo: any;
try {
kubeflowJobInfo = await operatorClient.getKubeflowJob(kubeflowTrialJob.kubeflowJobName);
} catch(error) {
this.log.error(`Get job ${kubeflowTrialJob.kubeflowJobName} info failed, error is ${error}`);
return Promise.resolve();
}
if(kubeflowJobInfo.status && kubeflowJobInfo.status.conditions) {
const latestCondition = kubeflowJobInfo.status.conditions[kubeflowJobInfo.status.conditions.length - 1];
const tfJobType : KubeflowTFJobType = <KubeflowTFJobType>latestCondition.type;
switch(tfJobType) {
case 'Created':
kubeflowTrialJob.status = 'WAITING';
kubeflowTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
break;
case 'Running':
kubeflowTrialJob.status = 'RUNNING';
if(!kubeflowTrialJob.startTime) {
kubeflowTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
}
break;
case 'Failed':
kubeflowTrialJob.status = 'FAILED';
kubeflowTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
break;
case 'Succeeded':
kubeflowTrialJob.status = 'SUCCEEDED';
kubeflowTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
break;
default:
break;
}
}
return Promise.resolve();
} }
} }
\ No newline at end of file
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
import * as component from '../../common/component'; import * as component from '../../common/component';
import { Inject } from 'typescript-ioc'; import { Inject } from 'typescript-ioc';
import { KubeflowTrainingService } from './kubeflowTrainingService'; import { KubernetesTrainingService } from './kubernetesTrainingService';
import { ClusterJobRestServer } from '../common/clusterJobRestServer' import { ClusterJobRestServer } from '../common/clusterJobRestServer'
/** /**
...@@ -29,23 +29,26 @@ import { ClusterJobRestServer } from '../common/clusterJobRestServer' ...@@ -29,23 +29,26 @@ import { ClusterJobRestServer } from '../common/clusterJobRestServer'
* *
*/ */
@component.Singleton @component.Singleton
export class KubeflowJobRestServer extends ClusterJobRestServer{ export class KubernetesJobRestServer extends ClusterJobRestServer{
@Inject @Inject
private readonly kubeflowTrainingService : KubeflowTrainingService; private kubernetesTrainingService? : KubernetesTrainingService;
/** /**
* constructor to provide NNIRestServer's own rest property, e.g. port * constructor to provide NNIRestServer's own rest property, e.g. port
*/ */
constructor() { constructor(kubernetesTrainingService: KubernetesTrainingService) {
super(); super();
this.kubeflowTrainingService = component.get(KubeflowTrainingService); this.kubernetesTrainingService = kubernetesTrainingService;
} }
protected handleTrialMetrics(jobId : string, metrics : any[]) : void { protected handleTrialMetrics(jobId : string, metrics : any[]) : void {
if(!this.kubernetesTrainingService) {
throw Error('kubernetesTrainingService not initialized!');
}
// Split metrics array into single metric, then emit // Split metrics array into single metric, then emit
// Warning: If not split metrics into single ones, the behavior will be UNKNOWN // Warning: If not split metrics into single ones, the behavior will be UNKNOWN
for (const singleMetric of metrics) { for (const singleMetric of metrics) {
this.kubeflowTrainingService.MetricsEmitter.emit('metric', { this.kubernetesTrainingService.MetricsEmitter.emit('metric', {
id : jobId, id : jobId,
data : singleMetric data : singleMetric
}); });
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment