nnimanager.ts 39.6 KB
Newer Older
liuzhe-lz's avatar
liuzhe-lz committed
1
2
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
Deshui Yu's avatar
Deshui Yu committed
3

4
import assert from 'assert';
chicm-ms's avatar
chicm-ms committed
5
import { ChildProcess, StdioOptions } from 'child_process';
Deshui Yu's avatar
Deshui Yu committed
6
7
8
import { Deferred } from 'ts-deferred';
import * as component from '../common/component';
import { DataStore, MetricDataRecord, MetricType, TrialJobInfo } from '../common/datastore';
9
import { NNIError } from '../common/errors';
10
import { getExperimentId, getDispatcherPipe } from '../common/experimentStartupInfo';
11
import { Logger, getLogger, stopLogging } from '../common/log';
Deshui Yu's avatar
Deshui Yu committed
12
import {
13
    ExperimentProfile, Manager, ExperimentStatus,
14
    NNIManagerStatus, ProfileUpdateType, TrialJobStatistics
Deshui Yu's avatar
Deshui Yu committed
15
} from '../common/manager';
16
import { ExperimentConfig, toSeconds, toCudaVisibleDevices } from '../common/experimentConfig';
17
import { ExperimentManager } from '../common/experimentManager';
J-shang's avatar
J-shang committed
18
import { TensorboardManager } from '../common/tensorboardManager';
Deshui Yu's avatar
Deshui Yu committed
19
import {
QuanluZhang's avatar
QuanluZhang committed
20
    TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus, TrialCommandContent, PlacementConstraint
Deshui Yu's avatar
Deshui Yu committed
21
} from '../common/trainingService';
22
import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP, getTunerProc, getLogLevel, isAlive, killPid } from '../common/utils';
Deshui Yu's avatar
Deshui Yu committed
23
import {
chicm-ms's avatar
chicm-ms committed
24
    INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, PING,
25
    REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE, IMPORT_DATA
Deshui Yu's avatar
Deshui Yu committed
26
} from './commands';
27
import { createDispatcherInterface, createDispatcherPipeInterface, IpcInterface } from './ipcInterface';
28
import { NNIRestServer } from '../rest_server/nniRestServer';
Deshui Yu's avatar
Deshui Yu committed
29
30

/**
chicm-ms's avatar
chicm-ms committed
31
 * NNIManager which implements Manager interface
Deshui Yu's avatar
Deshui Yu committed
32
33
 */
class NNIManager implements Manager {
34
    private trainingService!: TrainingService;
35
    private dispatcher: IpcInterface | undefined;
36
    private experimentManager: ExperimentManager;
37
    private currSubmittedTrialNum: number;  // need to be recovered
QuanluZhang's avatar
QuanluZhang committed
38
    private trialConcurrencyChange: number; // >0: increase, <0: decrease
Deshui Yu's avatar
Deshui Yu committed
39
40
    private log: Logger;
    private dataStore: DataStore;
41
    private experimentProfile!: ExperimentProfile;
42
    private dispatcherPid: number;
43
    private status: NNIManagerStatus;
44
    private waitingTrials: TrialJobApplicationForm[];
QuanluZhang's avatar
QuanluZhang committed
45
    private trialJobs: Map<string, TrialJobDetail>;
46
    private trialDataForTuner: string;
SparkSnail's avatar
SparkSnail committed
47
    private readonly: boolean;
48
    private config!: ExperimentConfig;
49

50
    private trialJobMetricListener: (metric: TrialJobMetric) => void;
51

Deshui Yu's avatar
Deshui Yu committed
52
53
    /**
     * Build a manager in the 'INITIALIZED' state.
     *
     * Resolves the ExperimentManager and DataStore through `component.get`,
     * resets all counters and trial bookkeeping, prepares the metric listener,
     * and, when `getDispatcherPipe()` returns a non-null pipe, connects the
     * dispatcher through that pipe.
     */
    constructor() {
        // Collaborators (resolved in the same order as before).
        this.experimentManager = component.get(ExperimentManager);
        this.log = getLogger('NNIManager');
        this.dataStore = component.get(DataStore);

        // Plain in-memory state.
        this.currSubmittedTrialNum = 0;
        this.trialConcurrencyChange = 0;
        this.dispatcherPid = 0;
        this.waitingTrials = [];
        this.trialJobs = new Map<string, TrialJobDetail>();
        this.trialDataForTuner = '';
        this.readonly = false;
        this.status = { status: 'INITIALIZED', errors: [] };

        // Any failure while handling a metric is escalated as a critical error.
        const escalateMetricFailure = (err: Error): void => {
            this.criticalError(NNIError.FromError(err, 'Job metrics error: '));
        };
        this.trialJobMetricListener = (metric: TrialJobMetric): void => {
            this.onTrialJobMetrics(metric).catch(escalateMetricFailure);
        };

        // A pre-existing dispatcher pipe takes the place of a spawned tuner process.
        const dispatcherPipe = getDispatcherPipe();
        if (dispatcherPipe !== null) {
            this.dispatcher = createDispatcherPipeInterface(dispatcherPipe);
        }
    }

    /**
     * Apply one field of `experimentProfile.params` to the live experiment
     * profile and persist the result with `storeExperimentProfile()`.
     *
     * The method is `async` so that every failure (readonly mode, an
     * unrecognized update type, or an error raised while updating the search
     * space) reaches the caller as a rejected promise instead of a mix of
     * rejections and synchronous throws.
     *
     * @param experimentProfile profile whose `params` carry the new value
     * @param updateType which setting to update
     * @returns a promise that settles after the profile has been stored
     */
    public async updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> {
        if (this.readonly) {
            throw new Error('Error: can not update experiment profile in readonly mode!');
        }

        switch (updateType) {
            case 'TRIAL_CONCURRENCY':
                this.updateTrialConcurrency(experimentProfile.params.trialConcurrency);
                break;
            case 'MAX_EXEC_DURATION':
                this.experimentProfile.params.maxExperimentDuration = experimentProfile.params.maxExperimentDuration;
                break;
            case 'SEARCH_SPACE':
                this.updateSearchSpace(experimentProfile.params.searchSpace);
                break;
            case 'MAX_TRIAL_NUM':
                this.experimentProfile.params.maxTrialNumber = experimentProfile.params.maxTrialNumber;
                break;
            default:
                throw new Error('Error: unrecognized updateType');
        }

        await this.storeExperimentProfile();
    }

104
    /**
     * Forward imported trial data to the dispatcher and record it in the data store.
     *
     * @param data payload sent with the IMPORT_DATA command and stored as an 'IMPORT_DATA' event
     * @returns the promise returned by the data store; rejects in readonly mode
     *          or when no dispatcher is available
     */
    public importData(data: string): Promise<void> {
        if (this.readonly) {
            const readonlyError = new Error('Error: can not import data in readonly mode!');
            return Promise.reject(readonlyError);
        }

        const tuner = this.dispatcher;
        if (tuner === undefined) {
            const missingTunerError = new Error('tuner has not been setup');
            return Promise.reject(missingTunerError);
        }

        tuner.sendCommand(IMPORT_DATA, data);
        return this.dataStore.storeTrialJobEvent('IMPORT_DATA', '', data);
    }

118
119
120
121
    /** Return the imported data entries as reported by the data store. */
    public getImportedData(): Promise<string[]> {
        const pendingImports: Promise<string[]> = this.dataStore.getImportedData();
        return pendingImports;
    }

122
123
124
125
    /** Return the string produced by the data store's `exportTrialHpConfigs()`. */
    public async exportData(): Promise<string> {
        const exported: string = await this.dataStore.exportTrialHpConfigs();
        return exported;
    }

126
    /**
     * Queue a user-supplied hyper-parameter set as a new waiting trial.
     *
     * The method is `async` so that malformed JSON in `hyperParams` is reported
     * as a rejected promise (it used to escape as a synchronous throw from a
     * promise-returning method). JSON is parsed before the sequence id is
     * consumed, so a parse failure leaves the manager state untouched.
     *
     * @param hyperParams JSON text of the customized parameters
     * @returns the sequence id assigned to the queued trial
     */
    public async addCustomizedTrialJob(hyperParams: string): Promise<number> {
        if (this.readonly) {
            throw new Error('Error: can not add customized trial job in readonly mode!');
        }
        if (this.currSubmittedTrialNum >= this.maxTrialNum) {
            throw new Error('reach maxTrialNum');
        }

        // TODO: NNI manager should not peek tuner's internal protocol, let's refactor this later
        const packedParameter = {
            parameter_id: null, // eslint-disable-line @typescript-eslint/camelcase
            parameter_source: 'customized', // eslint-disable-line @typescript-eslint/camelcase
            parameters: JSON.parse(hyperParams)
        };

        const form: TrialJobApplicationForm = {
            sequenceId: this.experimentProfile.nextSequenceId++,
            hyperParameters: {
                value: JSON.stringify(packedParameter),
                index: 0
            }
        };
        this.waitingTrials.push(form);

        // trial id has not been generated yet, thus use '' instead.
        // Recording the event stays fire-and-forget (the trial is already queued),
        // but a failure is logged rather than left as an unhandled rejection.
        this.dataStore.storeTrialJobEvent('ADD_CUSTOMIZED', '', hyperParams).catch((err: Error) => {
            this.log.error(`Failed to store ADD_CUSTOMIZED event: ${err}`);
        });

        return form.sequenceId;
    }

    /**
     * Cancel a trial at the user's request and record a 'USER_TO_CANCEL' event.
     *
     * @param trialJobId id of the trial to cancel
     * @returns resolves after the training service has cancelled the job and the
     *          event has been stored; rejects in readonly mode
     */
    public async cancelTrialJobByUser(trialJobId: string): Promise<void> {
        if (this.readonly) {
            throw new Error('Error: can not cancel trial job in readonly mode!');
        }

        this.log.info(`User cancelTrialJob: ${trialJobId}`);

        const service = this.trainingService;
        await service.cancelTrialJob(trialJobId);

        const store = this.dataStore;
        await store.storeTrialJobEvent('USER_TO_CANCEL', trialJobId, '');
    }

165
166
167
168
169
170
171
172
173
174
175
    /**
     * Start a brand-new experiment from `config`.
     *
     * Creates and stores a fresh experiment profile, initializes the training
     * service if none is set yet, launches the tuner in 'start' mode, switches
     * to 'RUNNING', and kicks off `run()` in the background (its failures are
     * routed to `criticalError`).
     *
     * @param config experiment configuration, also kept as `this.config`
     * @returns the id of the experiment profile
     */
    public async startExperiment(config: ExperimentConfig): Promise<string> {
        const freshProfile: ExperimentProfile = {
            params: config,
            id: getExperimentId(),
            execDuration: 0,
            logDir: getExperimentRootDir(),
            startTime: Date.now(),
            endTime: undefined,
            nextSequenceId: 0,
            revision: 0
        };
        this.experimentProfile = freshProfile;
        this.config = config;

        this.log.info(`Starting experiment: ${this.experimentProfile.id}`);
        await this.storeExperimentProfile();

        // A training service may already have been installed (e.g. via setClusterMetadata).
        if (this.trainingService === undefined) {
            this.log.info('Setup training service...');
            this.trainingService = await this.initTrainingService(config);
        }

        this.log.info('Setup tuner...');
        const tunerCommand: string = getMsgDispatcherCommand(config);
        this.log.debug(`dispatcher command: ${tunerCommand}`);
        const ckptDir: string = await this.createCheckpointDir();
        this.setupTuner(tunerCommand, undefined, 'start', ckptDir);

        this.setStatus('RUNNING');
        await this.storeExperimentProfile();

        // Main loop runs detached; errors are reported rather than awaited.
        const onRunFailure = (err: Error): void => {
            this.criticalError(err);
        };
        this.run().catch(onRunFailure);

        return this.experimentProfile.id;
    }

SparkSnail's avatar
SparkSnail committed
199
    /**
     * Resume (or open read-only) the experiment identified by `getExperimentId()`.
     *
     * Loads the stored profile, initializes the training service if needed and,
     * when `readonly` is true, stops there with status 'VIEWED'. Otherwise it
     * launches the tuner in 'resume' mode, marks trials that were still
     * WAITING/RUNNING as FAILED, gathers finished and imported trial data for
     * the tuner, switches to 'RUNNING' and starts `run()` in the background.
     *
     * @param readonly when true the experiment is only viewed, not continued
     */
    public async resumeExperiment(readonly: boolean): Promise<void> {
        // Fetch back the experiment profile
        const resumedId: string = getExperimentId();
        this.log.info(`Resuming experiment: ${resumedId}`);
        this.experimentProfile = await this.dataStore.getExperimentProfile(resumedId);

        const config: ExperimentConfig = this.experimentProfile.params;
        this.config = config;
        if (this.trainingService === undefined) {
            this.log.info('Setup training service...');
            this.trainingService = await this.initTrainingService(config);
        }

        this.readonly = readonly;
        if (readonly) {
            this.setStatus('VIEWED');
            return;
        }

        this.log.info('Setup tuner...');
        const tunerCommand: string = getMsgDispatcherCommand(config);
        this.log.debug(`dispatcher command: ${tunerCommand}`);
        const ckptDir: string = await this.createCheckpointDir();
        this.setupTuner(tunerCommand, undefined, 'resume', ckptDir);

        const storedJobs: TrialJobInfo[] = await this.dataStore.listTrialJobs();

        // Resume currSubmittedTrialNum
        this.currSubmittedTrialNum = storedJobs.length;

        // Jobs recorded as WAITING or RUNNING get a FAILED event on resume.
        const failureWrites: Promise<void>[] = [];
        for (const job of storedJobs) {
            if (job.status === 'WAITING' || job.status === 'RUNNING') {
                failureWrites.push(this.dataStore.storeTrialJobEvent('FAILED', job.trialJobId));
            }
        }
        await Promise.all(failureWrites);

        // Collect generated trials and imported trials
        const finishedTrialData: string = await this.exportData();
        const importedData: string[] = await this.dataStore.getImportedData();
        const generatedTrials: Record<string, any>[] = JSON.parse(finishedTrialData);
        // do not deduplicate
        const mergedTrials: Record<string, any>[] = importedData.reduce(
            (acc: Record<string, any>[], oneImportedData: string) =>
                acc.concat(<Record<string, any>[]>JSON.parse(oneImportedData)),
            generatedTrials
        );
        this.trialDataForTuner = JSON.stringify(mergedTrials);

        // An experiment that still has budget left must not carry an end time.
        const hasTimeLeft: boolean = this.experimentProfile.execDuration < this.maxDuration;
        const hasTrialsLeft: boolean = this.currSubmittedTrialNum < this.maxTrialNum;
        if (hasTimeLeft && hasTrialsLeft && this.experimentProfile.endTime) {
            delete this.experimentProfile.endTime;
        }
        this.setStatus('RUNNING');

        // TO DO: update database record for resume event
        const onRunFailure = (err: Error): void => {
            this.criticalError(err);
        };
        this.run().catch(onRunFailure);
    }

257
258
    /**
     * Look up a single trial job in the data store.
     *
     * @param trialJobId id of the trial to fetch
     */
    public getTrialJob(trialJobId: string): Promise<TrialJobInfo> {
        const lookup: Promise<TrialJobInfo> = this.dataStore.getTrialJob(trialJobId);
        return lookup;
    }

liuzhe-lz's avatar
liuzhe-lz committed
261
    /**
     * Pass a cluster metadata entry to the training service.
     *
     * When no training service exists yet, one is created based on `key`
     * ('kubeflow_config', 'frameworkcontroller_config' or 'adl_config'); any
     * other key in that situation is an error.
     *
     * @param key metadata key
     * @param value metadata value
     */
    public async setClusterMetadata(key: string, value: string): Promise<void> {
        // Hack for supporting v2 config, need refactor
        if (this.trainingService === undefined) {
            this.log.info('Setup training service...');
            if (key === 'kubeflow_config') {
                const kubeflowModule = await import('../training_service/kubernetes/kubeflow/kubeflowTrainingService');
                this.trainingService = new kubeflowModule.KubeflowTrainingService();
            } else if (key === 'frameworkcontroller_config') {
                const fcModule = await import('../training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService');
                this.trainingService = new fcModule.FrameworkControllerTrainingService();
            } else if (key === 'adl_config') {
                const adlModule = await import('../training_service/kubernetes/adl/adlTrainingService');
                this.trainingService = new adlModule.AdlTrainingService();
            } else {
                throw new Error("Setup training service failed.");
            }
        }

        await this.trainingService.setClusterMetadata(key, value);
    }

liuzhe-lz's avatar
liuzhe-lz committed
288
289
    /**
     * Read a cluster metadata entry from the training service.
     *
     * @param key metadata key to look up
     */
    public getClusterMetadata(key: string): Promise<string> {
        const pendingValue: Promise<string> = this.trainingService.getClusterMetadata(key);
        return pendingValue;
    }

    /** Return the trial job statistics reported by the data store. */
    public async getTrialJobStatistics(): Promise<TrialJobStatistics[]> {
        const statistics: TrialJobStatistics[] = await this.dataStore.getTrialJobStatistics();
        return statistics;
    }

296
    /**
     * Stop the experiment by running both shutdown phases in order:
     * `stopExperimentTopHalf()` first, then `stopExperimentBottomHalf()`.
     */
    public async stopExperiment(): Promise<void> {
        const phases: (() => Promise<void>)[] = [
            () => this.stopExperimentTopHalf(),
            () => this.stopExperimentBottomHalf()
        ];
        for (const runPhase of phases) {
            await runPhase();
        }
    }

    /**
     * First shutdown phase: mark the experiment 'STOPPING' and shut down the dispatcher.
     *
     * If no dispatcher exists, an error is logged and nothing else happens.
     * Otherwise the metric listener is detached and, when a dispatcher pid is
     * known, TERMINATE is sent, the process is polled once per second for up
     * to 30 checks, and `killPid` is called afterwards in every case.
     */
    public async stopExperimentTopHalf(): Promise<void> {
        this.setStatus('STOPPING');
        this.log.info('Stopping experiment, cleaning up ...');

        const tuner = this.dispatcher;
        if (tuner === undefined) {
            this.log.error('Tuner has not been setup');
            return;
        }

        this.trainingService.removeTrialJobMetricListener(this.trialJobMetricListener);

        if (this.dispatcherPid > 0) {
            tuner.sendCommand(TERMINATE);
            // gracefully terminate tuner and assessor here, wait at most 30 seconds.
            let checksDone: number = 0;
            while (checksDone < 30 && await isAlive(this.dispatcherPid)) {
                await delay(1000);
                checksDone += 1;
            }
            await killPid(this.dispatcherPid);
        }

        this.dispatcher = undefined;
    }

    /**
     * Second shutdown phase: cancel live trials, clean up the training service,
     * persist the final profile, stop auxiliary services and exit the process.
     *
     * Errors while cancelling an individual trial are logged at debug level and
     * ignored; errors from listing trials or `cleanUp()` are logged as errors.
     * The process always exits at the end, with code 1 if stopping any of the
     * auxiliary services failed and 0 otherwise.
     */
    public async stopExperimentBottomHalf(): Promise<void> {
        try {
            const liveCandidates: TrialJobDetail[] = await this.trainingService.listTrialJobs();

            // DON'T try to make it in parallel, the training service may not handle it well.
            // If there is performance concern, consider to support batch cancellation on training service.
            for (const trialJob of liveCandidates) {
                const stillActive: boolean = trialJob.status === 'RUNNING' || trialJob.status === 'WAITING';
                if (!stillActive) {
                    continue;
                }
                try {
                    this.log.info(`cancelTrialJob: ${trialJob.id}`);
                    await this.trainingService.cancelTrialJob(trialJob.id);
                } catch (error) {
                    this.log.debug(`ignorable error on canceling trial ${trialJob.id}. ${error}`);
                }
            }

            await this.trainingService.cleanUp();
        } catch (err) {
            this.log.error(`${err.stack}`);
        }

        if (this.experimentProfile.endTime === undefined) {
            this.setEndtime();
        }
        await this.storeExperimentProfile();
        this.setStatus('STOPPED');
        this.log.info('Experiment stopped.');

        let exitCode: number = 0;
        try {
            await this.experimentManager.stop();
            await component.get<TensorboardManager>(TensorboardManager).stop();
            await this.dataStore.close();
            await component.get<NNIRestServer>(NNIRestServer).stop();
        } catch (err) {
            exitCode = 1;
            this.log.error(`${err.stack}`);
        } finally {
            stopLogging();
            process.exit(exitCode);
        }
    }

368
    /**
     * Fetch metric records from the data store.
     *
     * @param trialJobId optional trial id, forwarded to the data store
     * @param metricType optional metric type, forwarded to the data store
     */
    public async getMetricData(trialJobId?: string, metricType?: MetricType): Promise<MetricDataRecord[]> {
        const records: MetricDataRecord[] = await this.dataStore.getMetricData(trialJobId, metricType);
        return records;
    }

372
373
374
375
376
377
    /**
     * Return the metrics of every trial whose sequence id lies in
     * `[minSeqId, maxSeqId]` (both ends inclusive). Trials without a sequence
     * id are left out.
     *
     * @param minSeqId lowest sequence id to include
     * @param maxSeqId highest sequence id to include
     */
    public async getMetricDataByRange(minSeqId: number, maxSeqId: number): Promise<MetricDataRecord[]> {
        // FIXME: can this be undefined?
        const withinRange = (seqId: number | undefined): boolean =>
            seqId !== undefined && minSeqId <= seqId && seqId <= maxSeqId;

        const everyTrial = await this.dataStore.listTrialJobs();
        const selectedIds = new Set(
            everyTrial
                .filter(trial => withinRange(trial.sequenceId))
                .map(trial => trial.trialJobId)
        );

        const everyMetric = await this.dataStore.getMetricData();
        return everyMetric.filter(metric => selectedIds.has(metric.trialJobId));
    }

    /**
     * Return all non-'PERIODICAL' metrics followed by, for each trial, the
     * 'PERIODICAL' metric with the highest `sequence` (on a tie the record
     * appearing later in the data store result wins).
     */
    public async getLatestMetricData(): Promise<MetricDataRecord[]> {
        // FIXME: this can take a long time
        const everyMetric: MetricDataRecord[] = await this.dataStore.getMetricData();

        const nonPeriodical: MetricDataRecord[] = everyMetric.filter(metric => metric.type !== 'PERIODICAL');

        const newestPerTrial: Map<string, MetricDataRecord> = new Map<string, MetricDataRecord>();
        for (const metric of everyMetric) {
            if (metric.type !== 'PERIODICAL') {
                continue;
            }
            const known: MetricDataRecord | undefined = newestPerTrial.get(metric.trialJobId);
            const isNewer: boolean = known === undefined || known.sequence <= metric.sequence;
            if (isNewer) {
                newestPerTrial.set(metric.trialJobId, metric);
            }
        }

        return nonPeriodical.concat(Array.from(newestPerTrial.values()));
        // FIXME: unit test
    }

Yuge Zhang's avatar
Yuge Zhang committed
403
404
    /**
     * Fetch a file belonging to a trial from the training service.
     *
     * @param trialJobId id of the trial
     * @param fileName name of the file, forwarded to the training service
     */
    public async getTrialFile(trialJobId: string, fileName: string): Promise<Buffer | string> {
        const content: Buffer | string = await this.trainingService.getTrialFile(trialJobId, fileName);
        return content;
    }

Deshui Yu's avatar
Deshui Yu committed
407
408
409
410
411
412
413
414
    /**
     * Return the current in-memory experiment profile.
     *
     * @returns a promise already resolved with `this.experimentProfile`
     */
    public getExperimentProfile(): Promise<ExperimentProfile> {
        // Promise.resolve replaces the hand-rolled Deferred; the result is the same.
        return Promise.resolve(this.experimentProfile);
    }

415
416
417
418
    /** Return the manager's current status object (the live object, not a copy). */
    public getStatus(): NNIManagerStatus {
        const current: NNIManagerStatus = this.status;
        return current;
    }

Deshui Yu's avatar
Deshui Yu committed
419
420
421
422
    /**
     * List trial jobs from the data store.
     *
     * @param status optional status, forwarded to the data store
     */
    public async listTrialJobs(status?: TrialJobStatus): Promise<TrialJobInfo[]> {
        const jobs: TrialJobInfo[] = await this.dataStore.listTrialJobs(status);
        return jobs;
    }

423
424
425
426
427
428
429
430
431
432
    /** Experiment duration limit converted with `toSeconds`; `Infinity` when not configured. */
    private get maxDuration(): number {
        const configured = this.experimentProfile.params.maxExperimentDuration;
        if (configured === undefined) {
            return Infinity;
        }
        return toSeconds(configured);
    }

    /** Maximum number of trials; `Infinity` when `maxTrialNumber` is not configured. */
    private get maxTrialNum(): number {
        const configured = this.experimentProfile.params.maxTrialNumber;
        if (configured === undefined) {
            return Infinity;
        }
        return configured;
    }

Ni Hao's avatar
Ni Hao committed
433
434
435
436
437
    /** Per-trial duration limit converted with `toSeconds`; `Infinity` when not configured. */
    private get maxTrialDuration(): number {
        const configured = this.experimentProfile.params.maxTrialDuration;
        if (configured === undefined) {
            return Infinity;
        }
        return toSeconds(configured);
    }

438
    /**
     * Create the training service that matches `config`.
     *
     * The platform is 'hybrid' when `config.trainingService` is an array,
     * otherwise `config.trainingService.platform`, falling back to
     * `config.trainingServicePlatform`. Reuse mode (array config or a truthy
     * `reuseMode` flag) and every platform without a dedicated branch use the
     * RouterTrainingService; 'local', 'kubeflow', 'frameworkcontroller' and
     * 'adl' have their own implementations.
     *
     * @param config experiment configuration
     * @returns the constructed training service
     * @throws Error when no platform can be determined
     */
    private async initTrainingService(config: ExperimentConfig): Promise<TrainingService> {
        const serviceConfig = config.trainingService;

        let platform: string;
        if (Array.isArray(serviceConfig)) {
            platform = 'hybrid';
        } else {
            platform = serviceConfig.platform ? serviceConfig.platform : (config as any).trainingServicePlatform;
        }
        if (!platform) {
            throw new Error('Cannot detect training service platform');
        }

        const reuseMode = Array.isArray(config.trainingService) || (config.trainingService as any).reuseMode;

        if (!reuseMode) {
            switch (platform) {
                case 'local': {
                    const localModule = await import('../training_service/local/localTrainingService');
                    return new localModule.LocalTrainingService(config);
                }
                case 'kubeflow': {
                    const kubeflowModule = await import('../training_service/kubernetes/kubeflow/kubeflowTrainingService');
                    return new kubeflowModule.KubeflowTrainingService();
                }
                case 'frameworkcontroller': {
                    const fcModule = await import('../training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService');
                    return new fcModule.FrameworkControllerTrainingService();
                }
                case 'adl': {
                    const adlModule = await import('../training_service/kubernetes/adl/adlTrainingService');
                    return new adlModule.AdlTrainingService();
                }
                default:
                    break;
            }
        }

        // Reuse mode and all remaining platforms share the router implementation.
        const routerModule = await import('../training_service/reusable/routerTrainingService');
        return await routerModule.RouterTrainingService.construct(config);
    }

473
474
    /**
     * Spawn the tuner (dispatcher) process unless a dispatcher already exists.
     *
     * The child inherits the current environment plus NNI-specific variables
     * and has stdin ignored, stdout/stderr shared with this process and two
     * extra 'pipe' descriptors. Its pid and IPC interface are stored on
     * the manager.
     *
     * @param command command line handed to `getTunerProc`
     * @param cwd working directory; `getLogDir()` is used when undefined or empty
     * @param mode value exported as NNI_MODE
     * @param dataDirectory value exported as NNI_CHECKPOINT_DIRECTORY
     */
    private setupTuner(command: string, cwd: string | undefined, mode: 'start' | 'resume', dataDirectory: string): void {
        if (this.dispatcher !== undefined) {
            return;
        }

        const stdio: StdioOptions = ['ignore', process.stdout, process.stderr, 'pipe', 'pipe'];
        const workingDir: string = (cwd === undefined || cwd === '') ? getLogDir() : cwd;

        // TO DO: add CUDA_VISIBLE_DEVICES
        const deprecatedOptions = this.config.deprecated;
        const includeIntermediateResultsEnv = !!(deprecatedOptions && deprecatedOptions.includeIntermediateResults);

        const nniEnv = {
            SDK_PROCESS: 'dispatcher',
            NNI_MODE: mode,
            NNI_CHECKPOINT_DIRECTORY: dataDirectory,
            NNI_LOG_DIRECTORY: getLogDir(),
            NNI_LOG_LEVEL: getLogLevel(),
            NNI_INCLUDE_INTERMEDIATE_RESULTS: includeIntermediateResultsEnv,
            CUDA_VISIBLE_DEVICES: toCudaVisibleDevices(this.experimentProfile.params.tunerGpuIndices)
        };
        const childEnv = Object.assign({}, process.env, nniEnv);

        const tunerProc: ChildProcess = getTunerProc(command, stdio, workingDir, childEnv);
        this.dispatcherPid = tunerProc.pid!;
        this.dispatcher = createDispatcherInterface(tunerProc);
    }

    /**
     * Record a new trial concurrency.
     *
     * The difference to the previous value is accumulated in
     * `trialConcurrencyChange` (>0: increase, <0: decrease) and the profile is
     * updated with the new value.
     *
     * @param trialConcurrency new concurrency; assumed >= 0 (checked by restserver)
     */
    private updateTrialConcurrency(trialConcurrency: number): void {
        const profileParams = this.experimentProfile.params;
        const delta: number = trialConcurrency - profileParams.trialConcurrency;
        this.trialConcurrencyChange += delta;
        profileParams.trialConcurrency = trialConcurrency;
    }

    /**
     * Replace the experiment's search space and notify the dispatcher.
     *
     * The JSON text is parsed before anything is sent, so a malformed search
     * space is rejected without reaching the tuner and without touching the
     * profile (it used to be sent first and only then fail to parse).
     *
     * @param searchSpace JSON text of the new search space
     * @throws Error when the dispatcher has not been set up
     * @throws SyntaxError when `searchSpace` is not valid JSON
     */
    private updateSearchSpace(searchSpace: string): void {
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }

        // Validate first: JSON.parse throws on malformed input.
        const parsedSearchSpace = JSON.parse(searchSpace);

        this.log.info(`Updated search space ${searchSpace}`);
        this.dispatcher.sendCommand(UPDATE_SEARCH_SPACE, searchSpace);
        this.experimentProfile.params.searchSpace = parsedSearchSpace;
    }

    /**
     * Once per second, until the status becomes 'ERROR', 'STOPPING' or
     * 'STOPPED', add one second to `execDuration` while the experiment is in
     * a 'RUNNING', 'NO_MORE_TRIAL' or 'TUNER_NO_MORE_TRIAL' state. The profile
     * is stored whenever the tick counter is a multiple of 10 during such
     * a state.
     */
    private async periodicallyUpdateExecDuration(): Promise<void> {
        const terminalStates: string[] = ['ERROR', 'STOPPING', 'STOPPED'];
        const countedStates: string[] = ['RUNNING', 'NO_MORE_TRIAL', 'TUNER_NO_MORE_TRIAL'];

        for (let tick: number = 1; !terminalStates.includes(this.status.status); tick += 1) {
            await delay(1000 * 1); // 1 seconds
            if (!countedStates.includes(this.status.status)) {
                continue;
            }
            this.experimentProfile.execDuration += 1;
            if (tick % 10 === 0) {
                await this.storeExperimentProfile();
            }
        }
    }

chicm-ms's avatar
chicm-ms committed
537
538
539
540
541
542
    /**
     * Send PING to the dispatcher every 5 seconds until the status becomes
     * 'ERROR', 'STOPPING' or 'STOPPED'.
     *
     * @throws Error (as rejection) when the dispatcher has not been set up
     */
    private async pingDispatcher(): Promise<void> {
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }

        const terminalStates: string[] = ['ERROR', 'STOPPING', 'STOPPED'];
        for (;;) {
            if (terminalStates.includes(this.status.status)) {
                break;
            }
            this.dispatcher.sendCommand(PING);
            await delay(1000 * 5);
        }
    }

547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
    /**
     * Cancel (as early-stopped) every tracked RUNNING trial whose elapsed time
     * since `startTime` exceeds `maxTrialDuration` seconds. Does nothing when
     * no per-trial limit is configured.
     */
    private async stopTrialIfOverMaxDurationLimit(): Promise<void> {
        if (this.maxTrialDuration === Infinity) {
            return;
        }

        // Iterate over a snapshot of the ids; each detail is looked up fresh per iteration.
        const trackedIds: string[] = Array.from(this.trialJobs.keys());
        for (const trialJobId of trackedIds) {
            const detail: TrialJobDetail | undefined = this.trialJobs.get(trialJobId);
            if (detail === undefined || detail.status !== 'RUNNING' || detail.startTime === undefined) {
                continue;
            }

            const elapsedSeconds = (Date.now() - detail.startTime) / 1000;
            if (elapsedSeconds > this.maxTrialDuration) {
                await this.trainingService.cancelTrialJob(trialJobId, true);
                this.log.info(`Trial job ${detail.id} has been canceled because it is over max trial duration.`);
            }
        }
    }

QuanluZhang's avatar
QuanluZhang committed
567
568
    /**
     * Polls the training service for the current state of every tracked trial job.
     *
     * For each job this records a status transition (log + data store event), refreshes the
     * cached message, and, once the job has reached a terminal status, drops it from
     * `trialJobs` and reports `TRIAL_END` to the dispatcher.
     *
     * @returns the number of trial jobs found in a terminal status during this poll
     * @throws Error if the dispatcher has not been set up
     */
    private async requestTrialJobsStatus(): Promise<number> {
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }
        let finishedTrialJobNum: number = 0;
        // Iterate over a copy of the keys: finished jobs are deleted from the map inside the loop.
        for (const trialJobId of Array.from(this.trialJobs.keys())) {
            const trialJobDetail: TrialJobDetail = await this.trainingService.getTrialJob(trialJobId);
            const oldTrialJobDetail: TrialJobDetail | undefined = this.trialJobs.get(trialJobId);
            if (oldTrialJobDetail !== undefined && oldTrialJobDetail.status !== trialJobDetail.status) {
                this.log.info(`Trial job ${trialJobDetail.id} status changed from ${oldTrialJobDetail.status} to ${trialJobDetail.status}`);
                // Store a shallow copy so the cached entry is a separate object from the one returned by the training service.
                this.trialJobs.set(trialJobId, Object.assign({}, trialJobDetail));
                await this.dataStore.storeTrialJobEvent(trialJobDetail.status, trialJobDetail.id, undefined, trialJobDetail);
            }
            // Keep the cached message current even when the status itself did not change.
            const newTrialJobDetail: TrialJobDetail | undefined = this.trialJobs.get(trialJobId);
            if (newTrialJobDetail !== undefined) {
                newTrialJobDetail.message = trialJobDetail.message;
            }
            switch (trialJobDetail.status) {
                case 'SUCCEEDED':
                case 'USER_CANCELED':
                case 'EARLY_STOPPED':
                case 'FAILED':
                case 'SYS_CANCELED':
                    // Terminal statuses. In the current version, failed and system-canceled jobs are not retried.
                    // TO DO: push failed jobs to a queue for retry
                    this.trialJobs.delete(trialJobId);
                    finishedTrialJobNum++;
                    this.dispatcher.sendCommand(TRIAL_END, JSON.stringify({
                        trial_job_id: trialJobDetail.id, // eslint-disable-line @typescript-eslint/camelcase
                        event: trialJobDetail.status,
                        hyper_params: trialJobDetail.form.hyperParameters.value // eslint-disable-line @typescript-eslint/camelcase
                    }));
                    break;
                case 'WAITING':
                case 'RUNNING':
                case 'UNKNOWN':
                    // Still in flight (or not yet known): nothing to do.
                    break;
                default:
                    this.log.warning(`Trial job ${trialJobDetail.id} has unexpected status: ${trialJobDetail.status}`);
            }
        }

        return finishedTrialJobNum;
    }

    /**
     * Trial scheduling loop.
     *
     * Until the manager status becomes ERROR, STOPPING or STOPPED, each iteration:
     * cancels over-long trials, refreshes trial job statuses, works out how many new trials to
     * request from the tuner, marks the experiment DONE once the duration / trial-count limit is
     * hit and every submitted trial has finished, and otherwise submits waiting trials up to the
     * configured concurrency. Iterations are spaced 5 seconds apart.
     *
     * @throws Error if the dispatcher has not been set up
     */
    private async manageTrials(): Promise<void> {
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }
        // Running total of finished trials, seeded with the number of trials already submitted.
        let totalFinished: number = this.currSubmittedTrialNum;
        while (!['ERROR', 'STOPPING', 'STOPPED'].includes(this.status.status)) {
            await this.stopTrialIfOverMaxDurationLimit();

            const newlyFinished: number = await this.requestTrialJobsStatus();
            totalFinished += newlyFinished;

            // `trialsToRequest` is the number of trials that will be requested from the tuner.
            // - Concurrency unchanged: it equals `newlyFinished`.
            // - Concurrency increased by 2 (trialConcurrencyChange = 2): it equals 2 + newlyFinished
            //   and trialConcurrencyChange is reset to 0.
            // - Concurrency decreased by 4 (trialConcurrencyChange = -4) with newlyFinished = 2:
            //   it becomes -2, nothing is requested, and trialConcurrencyChange becomes -2.
            const trialsToRequest: number = this.trialConcurrencyChange + newlyFinished;
            this.trialConcurrencyChange = trialsToRequest >= 0 ? 0 : trialsToRequest;

            // Check maxTrialNum and maxDuration here.
            // NO_MORE_TRIAL is more like a subset of RUNNING, because during RUNNING the tuner
            // might tell nnimanager that there are no more trials. In the NO_MORE_TRIAL state the
            // experiment is viewed as still running. DONE can be reached from RUNNING or NO_MORE_TRIAL.
            assert(
                this.status.status === 'RUNNING' ||
                this.status.status === 'DONE' ||
                this.status.status === 'NO_MORE_TRIAL' ||
                this.status.status === 'TUNER_NO_MORE_TRIAL',
                `Actual status: ${this.status.status}`
            );

            const limitReached: boolean =
                this.experimentProfile.execDuration > this.maxDuration ||
                this.currSubmittedTrialNum >= this.maxTrialNum;

            if (limitReached) {
                if (this.status.status !== 'DONE') {
                    this.setStatus('NO_MORE_TRIAL');
                    const submittedSoFar: number = this.currSubmittedTrialNum;

                    assert(totalFinished <= submittedSoFar);
                    if (totalFinished >= submittedSoFar) {
                        this.setStatus('DONE');
                        this.setEndtime();
                        await this.storeExperimentProfile();
                        // write this log for travis CI
                        this.log.info('Experiment done.');
                    }
                }
            } else {
                this.requestTrialJobs(trialsToRequest);

                // The experiment was DONE but the limits no longer apply: clear the recorded end time.
                if (this.status.status === 'DONE') {
                    delete this.experimentProfile.endTime;
                    await this.storeExperimentProfile();
                }
                if (this.status.status !== 'TUNER_NO_MORE_TRIAL') {
                    this.setStatus('RUNNING');
                }

                // Fill the free concurrency slots with waiting trials.
                for (let slot: number = this.trialJobs.size; slot < this.experimentProfile.params.trialConcurrency; slot++) {
                    const nothingWaiting: boolean = this.waitingTrials.length === 0;
                    if (nothingWaiting || this.currSubmittedTrialNum >= this.maxTrialNum) {
                        break;
                    }
                    const form = this.waitingTrials.shift() as TrialJobApplicationForm;
                    this.currSubmittedTrialNum++;
                    this.log.info('submitTrialJob: form:', form);
                    const submittedJob: TrialJobDetail = await this.trainingService.submitTrialJob(form);
                    // Take the shallow copy before the next await, as the original ordering does.
                    const submittedCopy: TrialJobDetail = Object.assign({}, submittedJob);
                    await this.storeExperimentProfile();
                    this.trialJobs.set(submittedJob.id, submittedCopy);

                    const storedJob: TrialJobDetail | undefined = this.trialJobs.get(submittedJob.id);
                    if (storedJob == undefined) {
                        assert(false, `undefined trialJobDetail in trialJobs: ${submittedJob.id}`);
                    } else {
                        await this.dataStore.storeTrialJobEvent(
                            storedJob.status, storedJob.id, form.hyperParameters.value, storedJob);
                    }
                }
            }
            await delay(1000 * 5); // 5 seconds
        }
    }

Deshui Yu's avatar
Deshui Yu committed
708
709
710
711
712
713
    /**
     * Bumps the experiment profile's revision number and writes the profile to the data store.
     *
     * @returns the promise returned by the data store's `storeExperimentProfile`
     */
    private storeExperimentProfile(): Promise<void> {
        const profile = this.experimentProfile;
        profile.revision += 1;

        return this.dataStore.storeExperimentProfile(profile);
    }

714
    /**
     * Wires up listeners, sends the tuner its initialization command, then runs the manager's
     * long-lived tasks concurrently and waits for all of them.
     *
     * A failure in the dispatcher ping, the training service, or trial management is re-thrown
     * as an `NNIError` whose message carries a prefix naming the failing component.
     */
    private async run(): Promise<void> {
        assert(this.dispatcher !== undefined);

        this.addEventListeners();
        this.sendInitTunerCommands();

        // Builds a rejection handler that re-throws the failure as an NNIError tagged with `prefix`.
        const rethrowWithPrefix = (prefix: string) => (err: Error): never => {
            throw NNIError.FromError(err, prefix);
        };

        // The tasks are started in this order, all within the same tick, before being awaited together.
        const execDurationTask = this.periodicallyUpdateExecDuration();
        const dispatcherPingTask = this.pingDispatcher().catch(rethrowWithPrefix('Dispatcher error: '));
        const trainingServiceTask = this.trainingService.run().catch(rethrowWithPrefix('Training service error: '));
        const trialManagementTask = this.manageTrials().catch(rethrowWithPrefix('Job management error: '));

        await Promise.all([
            execDurationTask,
            dispatcherPingTask,
            trainingServiceTask,
            trialManagementTask
        ]);
    }

QuanluZhang's avatar
QuanluZhang committed
734
    /**
     * Registers the metric listener on the training service and the command / error handlers
     * on the dispatcher.
     *
     * @throws Error if the dispatcher has not been set up
     */
    private addEventListeners(): void {
        this.log.info('Add event listeners');
        // TO DO: cannot run this method more than once in one NNIManager instance
        const dispatcher = this.dispatcher;
        if (dispatcher === undefined) {
            throw new Error('Error: tuner or job maintainer have not been setup');
        }
        this.trainingService.addTrialJobMetricListener(this.trialJobMetricListener);

        // A failure while handling a tuner command is escalated as a critical error.
        const onCommandFailure = (err: Error): void => {
            this.criticalError(NNIError.FromError(err, 'Tuner command event error: '));
        };
        dispatcher.onCommand((commandType: string, content: string) => {
            this.onTunerCommand(commandType, content).catch(onCommandFailure);
        });

        // The underlying error is logged; a generic critical error is raised in its place.
        dispatcher.onError((error: Error) => {
            this.log.error(`Dispatcher error: ${error.message}`);
            this.criticalError(new Error('Dispatcher stream error, tuner may have crashed.'));
        });
    }

    /**
     * Sends the INITIALIZE command, carrying the JSON-serialized search space, to the dispatcher.
     * The tuner needs to be initialized with the search space before generating any hyper parameters.
     *
     * @throws Error if the dispatcher has not been set up
     */
    private sendInitTunerCommands(): void {
        if (this.dispatcher === undefined) {
            throw new Error('Dispatcher error: tuner has not been setup');
        }
        // Serialize once and reuse it for both the log line and the command payload, so the log
        // shows what is actually sent instead of the default string form of the search space.
        const serializedSearchSpace: string = JSON.stringify(this.experimentProfile.params.searchSpace);
        this.log.debug(`Send tuner command: INITIALIZE: ${serializedSearchSpace}`);
        this.dispatcher.sendCommand(INITIALIZE, serializedSearchSpace);
    }

    /**
     * Handles a metric reported for a trial job: persists it and forwards it to the dispatcher.
     * Metrics whose job id is not in `trialJobs` are logged as a warning and dropped.
     *
     * @param metric the reported metric (job id and metric data)
     * @throws Error if the dispatcher has not been set up (raised after the metric is stored)
     */
    private async onTrialJobMetrics(metric: TrialJobMetric): Promise<void> {
        this.log.debug('NNIManager received trial job metrics:', metric);

        const isTrackedJob: boolean = this.trialJobs.has(metric.id);
        if (!isTrackedJob) {
            this.log.warning('NNIManager received non-existent trial job metrics:', metric);
            return;
        }

        // Persist first; the dispatcher check deliberately comes afterwards, as in the original flow.
        await this.dataStore.storeMetricData(metric.id, metric.data);
        if (this.dispatcher === undefined) {
            throw new Error('Error: tuner has not been setup');
        }
        this.dispatcher.sendCommand(REPORT_METRIC_DATA, metric.data);
    }

chicm-ms's avatar
chicm-ms committed
775
776
777
778
779
780
781
    /**
     * Asks the dispatcher to generate hyper parameters for `jobNum` new trials.
     * Does nothing when `jobNum` is less than 1.
     *
     * @param jobNum number of trial jobs to request
     * @throws Error if the dispatcher has not been set up
     */
    private requestTrialJobs(jobNum: number): void {
        if (jobNum < 1) {
            return;
        }
        const dispatcher = this.dispatcher;
        if (dispatcher === undefined) {
            throw new Error('Dispatcher error: tuner has not been setup');
        }

        const multiThread = this.config.deprecated && this.config.deprecated.multiThread;
        if (!multiThread) {
            dispatcher.sendCommand(REQUEST_TRIAL_JOBS, String(jobNum));
            return;
        }

        // Send multiple requests to ensure multiple hyper parameters are generated in a non-blocking way.
        // For a single REQUEST_TRIAL_JOBS request, hyper parameters are generated one by one, sequentially.
        for (let sent: number = 0; sent < jobNum; sent++) {
            dispatcher.sendCommand(REQUEST_TRIAL_JOBS, '1');
        }
    }

794
    /**
     * Dispatches a command received from the tuner/dispatcher process.
     *
     * @param commandType one of the dispatcher command constants handled below
     * @param content command payload; parsed as JSON for the commands that need it
     * @throws Error for a command type that is not handled, or if the dispatcher is needed but not set up
     */
    private async onTunerCommand(commandType: string, content: string): Promise<void> {
        this.log.info(`NNIManager received command from dispatcher: ${commandType}, ${content}`);
        switch (commandType) {
            case INITIALIZED: {
                // Tuner is initialized and the search space is set: feed it any imported trial data,
                // then request the first batch of hyper parameters.
                const hasImportedData: boolean = this.trialDataForTuner.length > 0;
                if (hasImportedData) {
                    if (this.dispatcher === undefined) {
                        throw new Error('Dispatcher error: tuner has not been setup');
                    }
                    this.dispatcher.sendCommand(IMPORT_DATA, this.trialDataForTuner);
                }
                this.requestTrialJobs(this.experimentProfile.params.trialConcurrency);
                break;
            }
            case NEW_TRIAL_JOB: {
                if (this.status.status === 'TUNER_NO_MORE_TRIAL') {
                    this.log.warning('It is not supposed to receive more trials after NO_MORE_TRIAL is set');
                    this.setStatus('RUNNING');
                }
                const parsedRequest: TrialCommandContent = JSON.parse(content);
                // Used when the tuner did not attach a placement constraint to the request.
                const defaultConstraint: PlacementConstraint = {type: 'None', gpus: []};
                const newTrialForm: TrialJobApplicationForm = {
                    sequenceId: this.experimentProfile.nextSequenceId++,
                    hyperParameters: {
                        value: content,
                        index: 0
                    },
                    placementConstraint: parsedRequest.placement_constraint || defaultConstraint
                };
                // Queued here; submission happens elsewhere in the manager.
                this.waitingTrials.push(newTrialForm);
                break;
            }
            case SEND_TRIAL_JOB_PARAMETER: {
                const parsedCommand: any = JSON.parse(content);
                assert(parsedCommand.parameter_index >= 0);
                assert(parsedCommand.trial_job_id !== undefined);

                const targetJobId = parsedCommand.trial_job_id;
                const updateForm: TrialJobApplicationForm = {
                    sequenceId: -1,  // FIXME: multi-phase tuner should use sequence ID instead of trial job ID
                    hyperParameters: {
                        value: content,
                        index: parsedCommand.parameter_index
                    }
                };
                this.log.info('updateTrialJob: job id:', targetJobId, 'form:', updateForm);
                await this.trainingService.updateTrialJob(targetJobId, updateForm);
                // NOTE(review): the original comment says the `parameters` field is set to an empty
                // string when the tuner cannot generate more hyper parameters, yet the check below
                // is against null — confirm which is intended.
                if (parsedCommand.parameters !== null) {
                    await this.dataStore.storeTrialJobEvent(
                        'ADD_HYPERPARAMETER', targetJobId, content, undefined);
                }
                break;
            }
            case NO_MORE_TRIAL_JOBS: {
                // Do not overwrite a terminal / stopping status.
                const isShuttingDown: boolean = ['ERROR', 'STOPPING', 'STOPPED'].includes(this.status.status);
                if (!isShuttingDown) {
                    this.setStatus('TUNER_NO_MORE_TRIAL');
                }
                break;
            }
            case KILL_TRIAL_JOB: {
                this.log.info('cancelTrialJob:', content);
                const jobIdToCancel = JSON.parse(content);
                await this.trainingService.cancelTrialJob(jobIdToCancel, true);
                break;
            }
            default:
                throw new Error('Error: unsupported command type from tuner');
        }
    }

863
864
865
866
867
868
869
870
871
872
    /**
     * Reports an unrecoverable error: records it through `logError` and echoes it to stderr.
     *
     * @param err the error to report
     */
    private criticalError(err: Error): void {
        // Record the error through logError first ...
        this.logError(err);
        // ... then echo it on the console as well.
        console.error(err);
    }

    /**
     * Logs an error, appends its message to `status.errors`, stamps the experiment end time,
     * and moves the manager into the ERROR status.
     *
     * @param err the error to record
     */
    private logError(err: Error): void {
        // Prefer the stack trace; fall back to the bare message so that an error without a
        // stack is still written to the log instead of being silently skipped.
        this.log.error(err.stack !== undefined ? err.stack : err.message);
        this.status.errors.push(err.message);
        this.setEndtime();
        this.setStatus('ERROR');
    }

    /**
     * Transitions the manager to `status`. A no-op when the status is unchanged; otherwise the
     * change is logged and the new status is passed to the experiment manager.
     *
     * @param status the status to switch to
     */
    private setStatus(status: ExperimentStatus): void {
        const previous = this.status.status;
        if (status === previous) {
            return;
        }
        this.log.info(`Change NNIManager status from: ${previous} to: ${status}`);
        this.status.status = status;
        this.experimentManager.setExperimentInfo(this.experimentProfile.id, 'status', this.status.status);
    }

885
886
887
888
889
    /**
     * Sets the experiment profile's end time to the current time and passes the same value
     * to the experiment manager.
     */
    private setEndtime(): void {
        const profile = this.experimentProfile;
        profile.endTime = Date.now();
        this.experimentManager.setExperimentInfo(profile.id, 'endTime', profile.endTime);
    }

QuanluZhang's avatar
QuanluZhang committed
890
891
892
893
    /**
     * Creates the checkpoint directory (path obtained from `getCheckpointDir`) via `mkDirP`.
     *
     * @returns the checkpoint directory path
     */
    private async createCheckpointDir(): Promise<string> {
        // TODO: test
        const checkpointDirPath: string = getCheckpointDir();
        await mkDirP(checkpointDirPath);

        return checkpointDirPath;
    }
J-shang's avatar
J-shang committed
896
897
898
899
900
901
902
903

    /**
     * Asks the training service for the local output path of a trial job.
     *
     * @param trialJobId id of the trial job
     * @returns the path reported by the training service
     */
    public async getTrialOutputLocalPath(trialJobId: string): Promise<string> {
        const localPath: string = await this.trainingService.getTrialOutputLocalPath(trialJobId);
        return localPath;
    }

    /**
     * Asks the training service to fetch output of a trial job.
     *
     * @param trialJobId id of the trial job
     * @param subpath sub-path forwarded unchanged to the training service
     */
    public async fetchTrialOutput(trialJobId: string, subpath: string): Promise<void> {
        await this.trainingService.fetchTrialOutput(trialJobId, subpath);
    }
Deshui Yu's avatar
Deshui Yu committed
904
905
906
}

export { NNIManager };