kubeflowTrainingService.ts 24.8 KB
Newer Older
liuzhe-lz's avatar
liuzhe-lz committed
1
2
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
3

4
'use strict';
5
6
7
8
9

import * as assert from 'assert';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import * as path from 'path';
10
import * as component from '../../../common/component';
11
12
13

import { getExperimentId } from '../../../common/experimentStartupInfo';
import {
14
    NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail, TrialJobStatus
15
16
} from '../../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils';
17
18
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
19
import { validateCodeDir } from '../../common/util';
20
21
import { NFSConfig } from '../kubernetesConfig';
import { KubernetesTrialJobDetail } from '../kubernetesData';
22
import { KubernetesJobRestServer } from '../kubernetesJobRestServer';
23
import { KubernetesTrainingService } from '../kubernetesTrainingService';
24
import { KubeflowOperatorClientFactory } from './kubeflowApiClient';
25
26
27
import { KubeflowClusterConfig, KubeflowClusterConfigAzure, KubeflowClusterConfigFactory, KubeflowClusterConfigNFS,
    KubeflowTrialConfig, KubeflowTrialConfigFactory, KubeflowTrialConfigPytorch, KubeflowTrialConfigTensorflow
} from './kubeflowConfig';
28
import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
29
import { KubeflowJobRestServer } from './kubeflowJobRestServer';
30
31
32
33
34
35
36
37
38

/**
 * Training Service implementation for Kubeflow
 * Refer to https://github.com/kubeflow/kubeflow for more information about Kubeflow.
 */
@component.Singleton
class KubeflowTrainingService extends KubernetesTrainingService implements KubernetesTrainingService {
    private kubeflowClusterConfig?: KubeflowClusterConfig;
    private kubeflowTrialConfig?: KubeflowTrialConfig;
39
    private readonly kubeflowJobInfoCollector: KubeflowJobInfoCollector;
40
41

    /**
     * Creates the job info collector over this service's trial job map and
     * records the id of the current experiment.
     */
    constructor() {
        super();
        const { trialJobsMap } = this;
        this.kubeflowJobInfoCollector = new KubeflowJobInfoCollector(trialJobsMap);
        this.experimentId = getExperimentId();
        this.log.info('Construct Kubeflow training service.');
    }

    /**
     * Main loop of the Kubeflow training service.
     *
     * Creates and starts the job REST server, then repeatedly calls
     * `delay(3000)` followed by a trial-status refresh until `this.stopping`
     * becomes true.
     *
     * @throws Error carrying the REST server's error message as soon as
     *         `getErrorMessage` is set. `this.stopping` is set to true before
     *         the error is thrown.
     */
    public async run(): Promise<void> {
        this.log.info('Run Kubeflow training service.');
        const restServer: KubernetesJobRestServer = new KubernetesJobRestServer(this);
        this.kubernetesJobRestServer = restServer;
        await restServer.start();
        restServer.setEnableVersionCheck = this.versionCheck;
        this.log.info(`Kubeflow Training service rest server listening on: ${restServer.endPoint}`);
        while (!this.stopping) {
            // collect metrics for Kubeflow jobs by interacting with Kubernetes API server
            await delay(3000);
            await this.kubeflowJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient);
            const restServerError = restServer.getErrorMessage;
            if (restServerError !== undefined) {
                // Flag the service as stopping BEFORE throwing; a statement placed
                // after the throw would never execute.
                this.stopping = true;
                throw new Error(restServerError);
            }
        }
        this.log.info('Kubeflow training service exit.');
    }

69
    /**
     * Submit one trial as a Kubeflow job.
     *
     * Waits for any pending code-directory upload, writes the run scripts into
     * a local per-trial folder, uploads that folder, creates the Kubeflow job,
     * and only then records the trial in `trialJobsMap`.
     *
     * @param form trial application form (hyper-parameters and sequence id)
     * @returns detail record of the submitted trial; its initial status is
     *          'FAILED' when the upload produced an empty output URL,
     *          otherwise 'WAITING'
     * @throws Error when the Kubeflow operator client has not been created
     */
    public async submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
        if (this.kubernetesCRDClient === undefined) {
            throw new Error('Kubeflow job operator client is undefined');
        }

        if (this.kubernetesRestServerPort === undefined) {
            const jobRestServer: KubeflowJobRestServer = component.get(KubeflowJobRestServer);
            this.kubernetesRestServerPort = jobRestServer.clusterRestServerPort;
        }

        // Make sure the experiment's code directory has finished uploading.
        if (this.copyExpCodeDirPromise !== undefined) {
            await this.copyExpCodeDirPromise;
        }

        const trialJobId: string = uniqueString(5);
        const experimentId: string = getExperimentId();
        const trialWorkingFolder: string = path.join(this.CONTAINER_MOUNT_PATH, 'nni', experimentId, trialJobId);
        const kubeflowJobName: string = `nni-exp-${this.experimentId}-trial-${trialJobId}`.toLowerCase();
        const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);

        // Prepare the run scripts locally, then upload them to storage.
        await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, form);
        const trialJobOutputUrl: string = await this.uploadFolder(trialLocalTempFolder, `nni/${experimentId}/${trialJobId}`);

        // An empty output URL means the upload did not yield a location.
        const initStatus: TrialJobStatus = trialJobOutputUrl ? 'WAITING' : 'FAILED';
        const submittedAt: number = Date.now();
        const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail(
            trialJobId,
            initStatus,
            submittedAt,
            trialWorkingFolder,
            form,
            kubeflowJobName,
            trialJobOutputUrl
        );

        // Build the Kubeflow job resource object and create the job from it.
        const kubeflowJobConfig: any = await this.prepareKubeflowConfig(trialJobId, trialWorkingFolder, kubeflowJobName);
        await this.kubernetesCRDClient.createKubernetesJob(kubeflowJobConfig);

        // Register the trial only after the Kubeflow job was created successfully.
        this.trialJobsMap.set(trialJobId, trialJobDetail);

        return trialJobDetail;
    }
116

117
118
119
120
121
122
    /**
     * Apply one piece of experiment metadata to this training service.
     *
     * Handled keys:
     *  - NNI_MANAGER_IP: stores the parsed NNI manager IP config.
     *  - KUBEFLOW_CLUSTER_CONFIG: parses the cluster config, prepares Azure or
     *    NFS storage according to `storageType`, and creates the operator client.
     *  - TRIAL_CONFIG: parses the trial config (requires the cluster config to
     *    be set already), validates the code directory and starts uploading it.
     *  - VERSION_CHECK / LOG_COLLECTION: stores the given setting.
     * Any other key is ignored.
     *
     * @param key metadata key
     * @param value metadata value; JSON text for the config keys
     * @throws Error when TRIAL_CONFIG arrives before the cluster config, or
     *         when code directory validation / upload start-up fails
     */
    public async setClusterMetadata(key: string, value: string): Promise<void> {
        switch (key) {
            case TrialConfigMetadataKey.NNI_MANAGER_IP:
                this.nniManagerIpConfig = <NNIManagerIpConfig>JSON.parse(value);
                break;

            case TrialConfigMetadataKey.KUBEFLOW_CLUSTER_CONFIG: {
                const kubeflowClusterJsonObject: object = JSON.parse(value);
                this.kubeflowClusterConfig = KubeflowClusterConfigFactory.generateKubeflowClusterConfig(kubeflowClusterJsonObject);
                if (this.kubeflowClusterConfig.storageType === 'azureStorage') {
                    const azureKubeflowClusterConfig: KubeflowClusterConfigAzure = <KubeflowClusterConfigAzure>this.kubeflowClusterConfig;
                    this.azureStorageAccountName = azureKubeflowClusterConfig.azureStorage.accountName;
                    this.azureStorageShare = azureKubeflowClusterConfig.azureStorage.azureShare;
                    await this.createAzureStorage(
                        azureKubeflowClusterConfig.keyVault.vaultName,
                        azureKubeflowClusterConfig.keyVault.name
                    );
                } else if (this.kubeflowClusterConfig.storageType === 'nfs') {
                    const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
                    await this.createNFSStorage(
                        nfsKubeflowClusterConfig.nfs.server,
                        nfsKubeflowClusterConfig.nfs.path
                    );
                }
                this.kubernetesCRDClient = KubeflowOperatorClientFactory.createClient(
                    this.kubeflowClusterConfig.operator, this.kubeflowClusterConfig.apiVersion);
                break;
            }
            case TrialConfigMetadataKey.TRIAL_CONFIG: {
                if (this.kubeflowClusterConfig === undefined) {
                    this.log.error('kubeflow cluster config is not initialized');

                    throw new Error('kubeflow cluster config is not initialized');
                }

                const kubeflowTrialJsonObject: object = JSON.parse(value);
                this.kubeflowTrialConfig = KubeflowTrialConfigFactory.generateKubeflowTrialConfig(
                    kubeflowTrialJsonObject,
                    this.kubeflowClusterConfig.operator
                );

                // Validate to make sure codeDir doesn't have too many files
                try {
                    await validateCodeDir(this.kubeflowTrialConfig.codeDir);
                    // Start uploading codeDir to storage. The promise is deliberately
                    // not awaited here; submitTrialJob awaits it before each submission.
                    this.copyExpCodeDirPromise = this.uploadFolder(this.kubeflowTrialConfig.codeDir, `nni/${getExperimentId()}/nni-code`);
                } catch (error) {
                    this.log.error(error);

                    // Re-throw the original error so its message, type and stack are
                    // preserved; only wrap values that are not Error instances.
                    throw error instanceof Error ? error : new Error(String(error));
                }
                break;
            }
            case TrialConfigMetadataKey.VERSION_CHECK:
                this.versionCheck = (value === 'true' || value === 'True');
                break;
            case TrialConfigMetadataKey.LOG_COLLECTION:
                this.logCollection = value;
                break;
            default:
        }
    }

183
    /**
     * Upload a local folder to the configured storage (Azure storage or NFS).
     *
     * For 'azureStorage' the work is delegated to `uploadFolderToAzureStorage`.
     * For 'nfs', or when `storage` is not set, the folder content is copied with
     * shell commands into `${this.trialLocalTempFolder}/${destDirectory}`.
     *
     * NOTE(review): both paths are interpolated into the shell commands without
     * quoting - confirm callers never pass paths containing whitespace.
     *
     * @param srcDirectory local folder whose content is uploaded
     * @param destDirectory destination path relative to the storage root
     * @returns the value returned by the Azure upload, or an
     *          `nfs://<server>:<destDirectory>` string for NFS
     * @throws Error when the cluster config, the trial config, or (for Azure)
     *         the storage client has not been initialized
     */
    private async uploadFolder(srcDirectory: string, destDirectory: string): Promise<string> {
        if (this.kubeflowClusterConfig === undefined) {
            throw new Error('Kubeflow Cluster config is not initialized');
        }
        if (this.kubeflowTrialConfig === undefined) {
            throw new Error('Kubeflow Trial config is not initialized');
        }

        assert(this.kubeflowClusterConfig.storage === undefined
            || this.kubeflowClusterConfig.storage === 'azureStorage'
            || this.kubeflowClusterConfig.storage === 'nfs');

        const storageKind = this.kubeflowClusterConfig.storage;

        if (storageKind === 'azureStorage') {
            if (this.azureStorageClient === undefined) {
                throw new Error('azureStorageClient is not initialized');
            }
            const retryCount = (<KubeflowClusterConfigAzure>this.kubeflowClusterConfig).uploadRetryCount;

            return await this.uploadFolderToAzureStorage(srcDirectory, destDirectory, retryCount);
        }

        if (storageKind === 'nfs' || storageKind === undefined) {
            // Create the target directory, then copy the source content into it.
            await cpp.exec(`mkdir -p ${this.trialLocalTempFolder}/${destDirectory}`);
            await cpp.exec(`cp -r ${srcDirectory}/* ${this.trialLocalTempFolder}/${destDirectory}/.`);
            const nfsSettings: NFSConfig = (<KubeflowClusterConfigNFS>this.kubeflowClusterConfig).nfs;

            return `nfs://${nfsSettings.server}:${destDirectory}`;
        }

        return '';
    }
214

215
216
    /**
     * Write the files a trial needs into its local temp folder.
     *
     * Creates `trialLocalTempFolder`, then writes `install_nni.sh`, one run
     * script per configured role (`run_worker.sh`, plus `run_ps.sh` for
     * tf-operator or `run_master.sh` for pytorch-operator) and the
     * hyper-parameter file.
     *
     * @param trialLocalTempFolder local folder that receives the files
     * @param trialJobId id of the trial
     * @param trialWorkingFolder working folder passed to the generated run scripts
     * @param form trial application form (sequence id and hyper-parameters)
     * @throws Error when the cluster or trial config is not initialized, or
     *         when the configured operator is neither 'tf-operator' nor
     *         'pytorch-operator'
     */
    private async prepareRunScript(trialLocalTempFolder: string, trialJobId: string, trialWorkingFolder: string,
                                   form: TrialJobApplicationForm): Promise<void> {
        if (this.kubeflowClusterConfig === undefined) {
            throw new Error('Kubeflow Cluster config is not initialized');
        }

        // Fail with a clear message instead of a TypeError on the property reads below.
        if (this.kubeflowTrialConfig === undefined) {
            throw new Error('Kubeflow trial config is not initialized');
        }

        // initialize kubeflow trial config to specific type
        let kubeflowTrialConfig: any;
        if (this.kubeflowClusterConfig.operator === 'tf-operator') {
            kubeflowTrialConfig = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
        } else if (this.kubeflowClusterConfig.operator === 'pytorch-operator') {
            kubeflowTrialConfig = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;
        } else {
            throw Error(`operator ${this.kubeflowClusterConfig.operator} is invalid`);
        }

        // Create the local trial folder through the fs API rather than a shell
        // command, so the path is interpreted exactly like the writes below.
        await fs.promises.mkdir(trialLocalTempFolder, { recursive: true });

        // Write NNI installation file to local tmp files
        const runScriptContent: string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
        await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' });

        // Write worker file content run_worker.sh to local tmp folders
        if (kubeflowTrialConfig.worker !== undefined) {
            const workerRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder,
                                                                                kubeflowTrialConfig.worker.command,
                                                                                form.sequenceId.toString(), 'worker',
                                                                                kubeflowTrialConfig.worker.gpuNum);
            await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_worker.sh'), workerRunScriptContent, { encoding: 'utf8' });
        }

        if (this.kubeflowClusterConfig.operator === 'tf-operator') {
            // Write parameter server file content run_ps.sh to local tmp folders
            const tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
            if (tensorflowTrialConfig.ps !== undefined) {
                const psRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder,
                                                                                tensorflowTrialConfig.ps.command,
                                                                                form.sequenceId.toString(),
                                                                                'ps', tensorflowTrialConfig.ps.gpuNum);
                await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_ps.sh'), psRunScriptContent, { encoding: 'utf8' });
            }
        } else if (this.kubeflowClusterConfig.operator === 'pytorch-operator') {
            // Write master file content run_master.sh to local tmp folders
            const pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;
            if (pytorchTrialConfig.master !== undefined) {
                const masterRunScriptContent: string = await this.generateRunScript('kubeflow', trialJobId, trialWorkingFolder,
                                                                                    pytorchTrialConfig.master.command,
                                                                                    form.sequenceId.toString(), 'master',
                                                                                    pytorchTrialConfig.master.gpuNum);
                await fs.promises.writeFile(path.join(trialLocalTempFolder, 'run_master.sh'), masterRunScriptContent, { encoding: 'utf8' });
            }
        }

        // Write file content ( parameter.cfg ) to local tmp folders
        if (form !== undefined) {
            await fs.promises.writeFile(path.join(trialLocalTempFolder, generateParamFileName(form.hyperParameters)),
                                        form.hyperParameters.value, { encoding: 'utf8' });
        }
    }
271

272
    /**
     * Compute the pod resource sections for the worker role and for the
     * non-worker role (ps for tf-operator, master for pytorch-operator), then
     * build the Kubeflow job config from them.
     *
     * Each `limits` section is a shallow copy of the matching `requests` section.
     *
     * @param trialJobId id of the trial
     * @param trialWorkingFolder working folder of the trial
     * @param kubeflowJobName name of the Kubeflow job to create
     * @returns the object produced by `generateKubeflowJobConfig`
     * @throws Error when the cluster or trial config is not initialized, or
     *         when the configured operator is neither 'tf-operator' nor
     *         'pytorch-operator'
     */
    private async prepareKubeflowConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName: string): Promise<any> {
        const clusterConfig = this.kubeflowClusterConfig;
        if (clusterConfig === undefined) {
            throw new Error('Kubeflow Cluster config is not initialized');
        }

        const trialConfig = this.kubeflowTrialConfig;
        if (trialConfig === undefined) {
            throw new Error('Kubeflow trial config is not initialized');
        }

        // Only the two known operators are accepted.
        const usesTensorflow: boolean = clusterConfig.operator === 'tf-operator';
        const usesPytorch: boolean = clusterConfig.operator === 'pytorch-operator';
        if (!usesTensorflow && !usesPytorch) {
            throw Error(`operator ${clusterConfig.operator} is invalid`);
        }

        // Worker role: requests are filled in only when a worker is configured.
        const workerPodResources: any = {};
        const workerRole: any = (<any>trialConfig).worker;
        if (workerRole !== undefined) {
            workerPodResources.requests = this.generatePodResource(workerRole.memoryMB, workerRole.cpuNum, workerRole.gpuNum);
        }
        workerPodResources.limits = {...workerPodResources.requests};

        // Non-worker role: optional ps for tensorflow, master for pytorch.
        const nonWorkerResources: any = {};
        if (usesTensorflow) {
            const psRole = (<KubeflowTrialConfigTensorflow>trialConfig).ps;
            if (psRole !== undefined) {
                nonWorkerResources.requests = this.generatePodResource(psRole.memoryMB, psRole.cpuNum, psRole.gpuNum);
                nonWorkerResources.limits = {...nonWorkerResources.requests};
            }
        } else {
            const masterRole = (<KubeflowTrialConfigPytorch>trialConfig).master;
            nonWorkerResources.requests = this.generatePodResource(masterRole.memoryMB, masterRole.cpuNum, masterRole.gpuNum);
            nonWorkerResources.limits = {...nonWorkerResources.requests};
        }

        // Generate kubeflow job resource config object
        return await this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName,
                                                    workerPodResources, nonWorkerResources);
    }
319
320
321
322
323
324
325
326
327

    /**
     * Build the Kubeflow job resource object for one trial.
     *
     * The `spec` holds `tfReplicaSpecs` or `pytorchReplicaSpecs`, chosen by the
     * trial config's `operatorType`; it is left undefined for any other type.
     *
     * @param trialJobId trial job id
     * @param trialWorkingFolder working folder
     * @param kubeflowJobName job name
     * @param workerPodResources resource section for worker pods
     * @param nonWorkerPodResources resource section for non-worker pods, like ps or master
     * @returns job object with apiVersion, kind, metadata and spec
     * @throws Error when the cluster config, trial config or operator client
     *         is not initialized
     */
    private async generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName: string, workerPodResources: any,
                                            nonWorkerPodResources?: any): Promise<any> {
        if (this.kubeflowClusterConfig === undefined) {
            throw new Error('Kubeflow Cluster config is not initialized');
        }
        if (this.kubeflowTrialConfig === undefined) {
            throw new Error('Kubeflow trial config is not initialized');
        }
        if (this.kubernetesCRDClient === undefined) {
            throw new Error('Kubeflow operator client is not initialized');
        }

        const replicaSpecs: any = {};
        let jobSpec: object | undefined;
        const operatorType = this.kubeflowTrialConfig.operatorType;

        if (operatorType === 'tf-operator') {
            const tfConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;

            const workerSecret = await this.createRegistrySecret(tfConfig.worker.privateRegistryAuthPath);
            replicaSpecs.Worker = this.generateReplicaConfig(trialWorkingFolder, tfConfig.worker.replicas,
                                                             tfConfig.worker.image, 'run_worker.sh', workerPodResources, workerSecret);

            if (tfConfig.ps !== undefined) {
                const psSecret: string | undefined = await this.createRegistrySecret(tfConfig.ps.privateRegistryAuthPath);
                replicaSpecs.Ps = this.generateReplicaConfig(trialWorkingFolder, tfConfig.ps.replicas,
                                                             tfConfig.ps.image, 'run_ps.sh', nonWorkerPodResources, psSecret);
            }

            jobSpec = {tfReplicaSpecs: replicaSpecs};
        } else if (operatorType === 'pytorch-operator') {
            const torchConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;

            if (torchConfig.worker !== undefined) {
                const workerSecret: string | undefined = await this.createRegistrySecret(torchConfig.worker.privateRegistryAuthPath);
                replicaSpecs.Worker = this.generateReplicaConfig(trialWorkingFolder, torchConfig.worker.replicas,
                                                                 torchConfig.worker.image, 'run_worker.sh', workerPodResources, workerSecret);
            }

            const masterSecret: string | undefined = await this.createRegistrySecret(torchConfig.master.privateRegistryAuthPath);
            replicaSpecs.Master = this.generateReplicaConfig(trialWorkingFolder, torchConfig.master.replicas,
                                                             torchConfig.master.image, 'run_master.sh', nonWorkerPodResources, masterSecret);

            jobSpec = {pytorchReplicaSpecs: replicaSpecs};
        }

        const jobLabels = {
            app: this.NNI_KUBERNETES_TRIAL_LABEL,
            expId: getExperimentId(),
            trialId: trialJobId
        };

        return {
            apiVersion: `kubeflow.org/${this.kubernetesCRDClient.apiVersion}`,
            kind: this.kubernetesCRDClient.jobKind,
            metadata: {
                name: kubeflowJobName,
                namespace: 'default',
                labels: jobLabels
            },
            spec: jobSpec
        };
    }

    /**
     * Generate the replica config section (replica count plus pod template)
     * for one role of a Kubeflow job.
     *
     * The pod mounts a single volume named 'nni-vol' at
     * `CONTAINER_MOUNT_PATH`: an Azure file share when the cluster's
     * `storageType` is 'azureStorage', an NFS export otherwise.
     *
     * @param trialWorkingFolder trial working folder
     * @param replicaNumber replica number
     * @param replicaImage image
     * @param runScriptFile script file name, run with `sh` from the working folder
     * @param podResources pod resource config section
     * @param privateRegistrySecretName name of the image pull secret; when
     *        falsy, no `imagePullSecrets` entry is added
     * @returns object with `replicas` and `template` fields
     * @throws Error when the cluster config, trial config or operator client
     *         is not initialized
     */
    private generateReplicaConfig(trialWorkingFolder: string, replicaNumber: number, replicaImage: string, runScriptFile: string,
                                  podResources: any, privateRegistrySecretName: string | undefined): any {
        if (this.kubeflowClusterConfig === undefined) {
            throw new Error('Kubeflow Cluster config is not initialized');
        }

        if (this.kubeflowTrialConfig === undefined) {
            throw new Error('Kubeflow trial config is not initialized');
        }

        if (this.kubernetesCRDClient === undefined) {
            throw new Error('Kubeflow operator client is not initialized');
        }

        // The config spec for volume field
        let nniVolumes: object[];
        if (this.kubeflowClusterConfig.storageType === 'azureStorage') {
            nniVolumes = [
                {
                    name: 'nni-vol',
                    azureFile: {
                        secretName: `${this.azureStorageSecretName}`,
                        shareName: `${this.azureStorageShare}`,
                        // Kubernetes spells this field `readOnly`.
                        readOnly: false
                    }
                }
            ];
        } else {
            const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
            nniVolumes = [
                {
                    name: 'nni-vol',
                    nfs: {
                        server: `${nfsKubeflowClusterConfig.nfs.server}`,
                        path: `${nfsKubeflowClusterConfig.nfs.path}`
                    }
                }
            ];
        }

        // The config spec for container field
        const containers: object[] = [
            {
                // The container name is supplied by the operator client.
                name: this.kubernetesCRDClient.containerName,
                image: replicaImage,
                args: ['sh', `${path.join(trialWorkingFolder, runScriptFile)}`],
                volumeMounts: [
                    {
                        name: 'nni-vol',
                        mountPath: this.CONTAINER_MOUNT_PATH
                    }
                ],
                resources: podResources
            }
        ];

        const spec: any = {
            containers: containers,
            restartPolicy: 'ExitCode',
            volumes: nniVolumes
        };
        if (privateRegistrySecretName) {
            spec.imagePullSecrets = [
                {
                    name: privateRegistrySecretName
                }
            ];
        }

        return {
            replicas: replicaNumber,
            template: {
                metadata: {
                    creationTimestamp: null
                },
                spec: spec
            }
        };
    }
467
468
469
470

    /**
     * Updating a trial job is not supported by this training service; the
     * returned promise always rejects.
     */
    public async updateTrialJob(_1: any, _2: any): Promise<TrialJobDetail> {
        const unsupported: Error = new Error('not supported');
        throw unsupported;
    }
471
}
472
export { KubeflowTrainingService };